Libraries

library(leaps)
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(MASS)

Summary of Attributes

Matrix column entries (attributes): name - ASCII subject name and recording number MDVP:Fo(Hz) - Average vocal fundamental frequency MDVP:Fhi(Hz) - Maximum vocal fundamental frequency MDVP:Flo(Hz) - Minimum vocal fundamental frequency MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude NHR,HNR - Two measures of ratio of noise to tonal components in the voice status - Health status of the subject (one) - Parkinson’s, (zero) - healthy RPDE,D2 - Two nonlinear dynamical complexity measures DFA - Signal fractal scaling exponent spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation

Reading in the File

pd = read.csv("/Users/piachouaifaty/parkinson.csv")
length(which(pd$status==0))
## [1] 48

48 measurements for healthy patients

length(which(pd$status==1))
## [1] 147

147 measurements for PD patients

Averaging Repeated Measurements

pdavg = pd

Removing the recording number from the names to be able to match and average them

#substitutes the characters after the last underscore with a "", so removes them
pdavg$name = sub("_[^_]+$", "", pdavg$name)
head(pdavg)
##           name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 1 phon_R01_S01     119.992      157.302       74.997        0.00784
## 2 phon_R01_S01     122.400      148.650      113.819        0.00968
## 3 phon_R01_S01     116.682      131.111      111.555        0.01050
## 4 phon_R01_S01     116.676      137.871      111.366        0.00997
## 5 phon_R01_S01     116.014      141.781      110.655        0.01284
## 6 phon_R01_S01     120.552      131.162      113.787        0.00968
##   MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1          0.00007  0.00370  0.00554    0.01109      0.04374            0.426
## 2          0.00008  0.00465  0.00696    0.01394      0.06134            0.626
## 3          0.00009  0.00544  0.00781    0.01633      0.05233            0.482
## 4          0.00009  0.00502  0.00698    0.01505      0.05492            0.517
## 5          0.00011  0.00655  0.00908    0.01966      0.06425            0.584
## 6          0.00008  0.00463  0.00750    0.01388      0.04701            0.456
##   Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA     NHR    HNR status     RPDE
## 1      0.02182      0.03130  0.02971     0.06545 0.02211 21.033      1 0.414783
## 2      0.03134      0.04518  0.04368     0.09403 0.01929 19.085      1 0.458359
## 3      0.02757      0.03858  0.03590     0.08270 0.01309 20.651      1 0.429895
## 4      0.02924      0.04005  0.03772     0.08771 0.01353 20.644      1 0.434969
## 5      0.03490      0.04825  0.04465     0.10470 0.01767 19.649      1 0.417356
## 6      0.02328      0.03526  0.03243     0.06985 0.01222 21.378      1 0.415564
##        DFA   spread1  spread2       D2      PPE
## 1 0.815285 -4.813031 0.266482 2.301442 0.284654
## 2 0.819521 -4.075192 0.335590 2.486855 0.368674
## 3 0.825288 -4.443179 0.311173 2.342259 0.332634
## 4 0.819235 -4.117501 0.334147 2.405554 0.368975
## 5 0.823484 -3.747787 0.234513 2.332180 0.410335
## 6 0.825069 -4.242867 0.299111 2.187560 0.357775

Averaging all the rows using aggregate by name (after removing the recording number, we are left with the individual ids)

pdavg=aggregate(pdavg[,2:24], list(pdavg$name), mean)
head(pdavg)
##        Group.1 MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 1 phon_R01_S01   118.71933     141.3128    106.02983    0.010085000
## 2 phon_R01_S02    99.77033     121.8943     95.41317    0.004585000
## 3 phon_R01_S04   147.34617     216.8675     87.53233    0.004346667
## 4 phon_R01_S05   159.83767     181.6302     86.76717    0.006246667
## 5 phon_R01_S06   150.64467     208.2643     78.27833    0.005230000
## 6 phon_R01_S07   200.26683     210.8843    194.36617    0.002163333
##   MDVP.Jitter.Abs.    MDVP.RAP    MDVP.PPQ  Jitter.DDP MDVP.Shimmer
## 1     8.666667e-05 0.004998333 0.007311667 0.014991667   0.05393167
## 2     5.000000e-05 0.002325000 0.002856667 0.006978333   0.02166833
## 3     3.000000e-05 0.001760000 0.002320000 0.005285000   0.01934333
## 4     4.000000e-05 0.003061667 0.003421667 0.009188333   0.04333667
## 5     3.666667e-05 0.002725000 0.002838333 0.008173333   0.02136667
## 6     9.666667e-06 0.001175000 0.001281667 0.003523333   0.01080333
##   MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5   MDVP.APQ Shimmer.DDA         NHR
## 1       0.51516667  0.028025000   0.03977000 0.03734833  0.08407333 0.016318333
## 2       0.19433333  0.011041667   0.01311333 0.01812333  0.03312500 0.008916667
## 3       0.18166667  0.009383333   0.01100333 0.01841000  0.02814833 0.013080000
## 4       0.38816667  0.020491667   0.02655000 0.04445833  0.06147667 0.025608333
## 5       0.20966667  0.009673333   0.01262500 0.02072500  0.02902333 0.014891667
## 6       0.09566667  0.005383333   0.00687000 0.00819500  0.01614833 0.001495000
##        HNR status      RPDE       DFA   spread1   spread2       D2        PPE
## 1 20.40667      1 0.4284877 0.8213137 -4.239926 0.2968360 2.342642 0.35384117
## 2 22.99733      1 0.5984292 0.7780167 -5.420414 0.3082893 2.287428 0.23401917
## 3 23.89967      1 0.5216600 0.6458433 -5.337281 0.2492883 2.360638 0.23200700
## 4 19.05867      1 0.6267230 0.6958855 -4.560947 0.2784825 2.787870 0.31065783
## 5 24.76200      1 0.4327235 0.7196750 -6.223537 0.2282963 2.440360 0.16493883
## 6 30.99217      0 0.3955782 0.7414823 -7.589537 0.1730488 1.795701 0.06811333
length(which(pdavg$status==0))
## [1] 8
length(which(pdavg$status==1))
## [1] 24

8 healthy patients and 24 patients with PD

Analysis on Full Dataset (All Measurements)

pd$status=as.factor(pd$status)
summary(pd)
##      name            MDVP.Fo.Hz.      MDVP.Fhi.Hz.    MDVP.Flo.Hz.   
##  Length:195         Min.   : 88.33   Min.   :102.1   Min.   : 65.48  
##  Class :character   1st Qu.:117.57   1st Qu.:134.9   1st Qu.: 84.29  
##  Mode  :character   Median :148.79   Median :175.8   Median :104.31  
##                     Mean   :154.23   Mean   :197.1   Mean   :116.32  
##                     3rd Qu.:182.77   3rd Qu.:224.2   3rd Qu.:140.02  
##                     Max.   :260.11   Max.   :592.0   Max.   :239.17  
##  MDVP.Jitter...     MDVP.Jitter.Abs.       MDVP.RAP           MDVP.PPQ       
##  Min.   :0.001680   Min.   :7.000e-06   Min.   :0.000680   Min.   :0.000920  
##  1st Qu.:0.003460   1st Qu.:2.000e-05   1st Qu.:0.001660   1st Qu.:0.001860  
##  Median :0.004940   Median :3.000e-05   Median :0.002500   Median :0.002690  
##  Mean   :0.006220   Mean   :4.396e-05   Mean   :0.003306   Mean   :0.003446  
##  3rd Qu.:0.007365   3rd Qu.:6.000e-05   3rd Qu.:0.003835   3rd Qu.:0.003955  
##  Max.   :0.033160   Max.   :2.600e-04   Max.   :0.021440   Max.   :0.019580  
##    Jitter.DDP        MDVP.Shimmer     MDVP.Shimmer.dB.  Shimmer.APQ3     
##  Min.   :0.002040   Min.   :0.00954   Min.   :0.0850   Min.   :0.004550  
##  1st Qu.:0.004985   1st Qu.:0.01650   1st Qu.:0.1485   1st Qu.:0.008245  
##  Median :0.007490   Median :0.02297   Median :0.2210   Median :0.012790  
##  Mean   :0.009920   Mean   :0.02971   Mean   :0.2823   Mean   :0.015664  
##  3rd Qu.:0.011505   3rd Qu.:0.03789   3rd Qu.:0.3500   3rd Qu.:0.020265  
##  Max.   :0.064330   Max.   :0.11908   Max.   :1.3020   Max.   :0.056470  
##   Shimmer.APQ5        MDVP.APQ        Shimmer.DDA           NHR          
##  Min.   :0.00570   Min.   :0.00719   Min.   :0.01364   Min.   :0.000650  
##  1st Qu.:0.00958   1st Qu.:0.01308   1st Qu.:0.02474   1st Qu.:0.005925  
##  Median :0.01347   Median :0.01826   Median :0.03836   Median :0.011660  
##  Mean   :0.01788   Mean   :0.02408   Mean   :0.04699   Mean   :0.024847  
##  3rd Qu.:0.02238   3rd Qu.:0.02940   3rd Qu.:0.06080   3rd Qu.:0.025640  
##  Max.   :0.07940   Max.   :0.13778   Max.   :0.16942   Max.   :0.314820  
##       HNR         status       RPDE             DFA            spread1      
##  Min.   : 8.441   0: 48   Min.   :0.2566   Min.   :0.5743   Min.   :-7.965  
##  1st Qu.:19.198   1:147   1st Qu.:0.4213   1st Qu.:0.6748   1st Qu.:-6.450  
##  Median :22.085           Median :0.4960   Median :0.7223   Median :-5.721  
##  Mean   :21.886           Mean   :0.4985   Mean   :0.7181   Mean   :-5.684  
##  3rd Qu.:25.076           3rd Qu.:0.5876   3rd Qu.:0.7619   3rd Qu.:-5.046  
##  Max.   :33.047           Max.   :0.6852   Max.   :0.8253   Max.   :-2.434  
##     spread2               D2             PPE         
##  Min.   :0.006274   Min.   :1.423   Min.   :0.04454  
##  1st Qu.:0.174350   1st Qu.:2.099   1st Qu.:0.13745  
##  Median :0.218885   Median :2.362   Median :0.19405  
##  Mean   :0.226510   Mean   :2.382   Mean   :0.20655  
##  3rd Qu.:0.279234   3rd Qu.:2.636   3rd Qu.:0.25298  
##  Max.   :0.450493   Max.   :3.671   Max.   :0.52737

Plotting for Visualization

pairs(pd[,-1], col=pd$status)

#red=parkinson's
plot(pd$PPE, pd$spread1, col=pd$status)

plot(pd$PPE, pd$spread1, col=pd$status)

pd2=pd[,-1]
rownames(pd2) = pd[,1]
pd2=pd2[,-17]
par(mfrow=c(3,3))
for (i in colnames(pd2))
  {hist(pd2[,i], main=i)}

par(mfrow=c(1,3))
for (i in colnames(pd2))
  {boxplot(pd2[,i], main=i)}

outl=boxplot.stats(pd2[,"MDVP.Flo.Hz."])$out
ind=which(pd2[,"MDVP.Flo.Hz."] %in% c(outl))
pd2[c(ind),]
##                MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S10_1     237.226      247.326      225.227        0.00298
## phon_R01_S10_2     241.404      248.834      232.483        0.00281
## phon_R01_S10_3     243.439      250.912      232.435        0.00210
## phon_R01_S10_4     242.852      255.034      227.911        0.00225
## phon_R01_S10_5     245.510      262.090      231.848        0.00235
## phon_R01_S17_4     228.832      234.619      223.634        0.00296
## phon_R01_S42_2     237.323      243.709      229.256        0.00303
## phon_R01_S42_3     260.105      264.919      237.303        0.00339
## phon_R01_S42_6     244.990      272.210      239.170        0.00451
##                MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## phon_R01_S10_1            1e-05  0.00169  0.00182    0.00507      0.01752
## phon_R01_S10_2            1e-05  0.00157  0.00173    0.00470      0.01760
## phon_R01_S10_3            9e-06  0.00109  0.00137    0.00327      0.01419
## phon_R01_S10_4            9e-06  0.00117  0.00139    0.00350      0.01494
## phon_R01_S10_5            1e-05  0.00127  0.00148    0.00380      0.01608
## phon_R01_S17_4            1e-05  0.00175  0.00155    0.00526      0.01644
## phon_R01_S42_2            1e-05  0.00173  0.00159    0.00519      0.01242
## phon_R01_S42_3            1e-05  0.00205  0.00186    0.00616      0.02030
## phon_R01_S42_6            2e-05  0.00279  0.00237    0.00837      0.01897
##                MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## phon_R01_S10_1            0.164      0.01035      0.01024  0.01133     0.03104
## phon_R01_S10_2            0.154      0.01006      0.01038  0.01251     0.03017
## phon_R01_S10_3            0.126      0.00777      0.00898  0.01033     0.02330
## phon_R01_S10_4            0.134      0.00847      0.00879  0.01014     0.02542
## phon_R01_S10_5            0.141      0.00906      0.00977  0.01149     0.02719
## phon_R01_S17_4            0.145      0.00882      0.01075  0.01179     0.02647
## phon_R01_S42_2            0.116      0.00696      0.00747  0.00882     0.02089
## phon_R01_S42_3            0.197      0.01186      0.01230  0.01367     0.03557
## phon_R01_S42_6            0.181      0.01084      0.01121  0.01255     0.03253
##                    NHR    HNR     RPDE      DFA   spread1  spread2       D2
## phon_R01_S10_1 0.00740 22.736 0.305062 0.654172 -7.310550 0.098648 2.416838
## phon_R01_S10_2 0.00675 23.145 0.457702 0.634267 -6.793547 0.158266 2.256699
## phon_R01_S10_3 0.00454 25.368 0.438296 0.635285 -7.057869 0.091608 2.330716
## phon_R01_S10_4 0.00476 25.032 0.431285 0.638928 -6.995820 0.102083 2.365800
## phon_R01_S10_5 0.00476 24.602 0.467489 0.631653 -7.156076 0.127642 2.392122
## phon_R01_S17_4 0.00351 25.964 0.256570 0.683296 -7.245620 0.018689 2.498224
## phon_R01_S42_2 0.00533 24.679 0.384868 0.626710 -7.018057 0.176316 1.852402
## phon_R01_S42_3 0.00910 21.083 0.440988 0.628058 -7.517934 0.160414 1.881767
## phon_R01_S42_6 0.01049 21.528 0.522812 0.646818 -7.304500 0.171088 2.095237
##                     PPE
## phon_R01_S10_1 0.095032
## phon_R01_S10_2 0.117399
## phon_R01_S10_3 0.091470
## phon_R01_S10_4 0.102706
## phon_R01_S10_5 0.097336
## phon_R01_S17_4 0.093534
## phon_R01_S42_2 0.091604
## phon_R01_S42_3 0.075587
## phon_R01_S42_6 0.096220
pd[c(ind),]
##               name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 43  phon_R01_S10_1     237.226      247.326      225.227        0.00298
## 44  phon_R01_S10_2     241.404      248.834      232.483        0.00281
## 45  phon_R01_S10_3     243.439      250.912      232.435        0.00210
## 46  phon_R01_S10_4     242.852      255.034      227.911        0.00225
## 47  phon_R01_S10_5     245.510      262.090      231.848        0.00235
## 64  phon_R01_S17_4     228.832      234.619      223.634        0.00296
## 167 phon_R01_S42_2     237.323      243.709      229.256        0.00303
## 168 phon_R01_S42_3     260.105      264.919      237.303        0.00339
## 171 phon_R01_S42_6     244.990      272.210      239.170        0.00451
##     MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 43             1e-05  0.00169  0.00182    0.00507      0.01752            0.164
## 44             1e-05  0.00157  0.00173    0.00470      0.01760            0.154
## 45             9e-06  0.00109  0.00137    0.00327      0.01419            0.126
## 46             9e-06  0.00117  0.00139    0.00350      0.01494            0.134
## 47             1e-05  0.00127  0.00148    0.00380      0.01608            0.141
## 64             1e-05  0.00175  0.00155    0.00526      0.01644            0.145
## 167            1e-05  0.00173  0.00159    0.00519      0.01242            0.116
## 168            1e-05  0.00205  0.00186    0.00616      0.02030            0.197
## 171            2e-05  0.00279  0.00237    0.00837      0.01897            0.181
##     Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA     NHR    HNR status
## 43       0.01035      0.01024  0.01133     0.03104 0.00740 22.736      0
## 44       0.01006      0.01038  0.01251     0.03017 0.00675 23.145      0
## 45       0.00777      0.00898  0.01033     0.02330 0.00454 25.368      0
## 46       0.00847      0.00879  0.01014     0.02542 0.00476 25.032      0
## 47       0.00906      0.00977  0.01149     0.02719 0.00476 24.602      0
## 64       0.00882      0.01075  0.01179     0.02647 0.00351 25.964      0
## 167      0.00696      0.00747  0.00882     0.02089 0.00533 24.679      0
## 168      0.01186      0.01230  0.01367     0.03557 0.00910 21.083      0
## 171      0.01084      0.01121  0.01255     0.03253 0.01049 21.528      0
##         RPDE      DFA   spread1  spread2       D2      PPE
## 43  0.305062 0.654172 -7.310550 0.098648 2.416838 0.095032
## 44  0.457702 0.634267 -6.793547 0.158266 2.256699 0.117399
## 45  0.438296 0.635285 -7.057869 0.091608 2.330716 0.091470
## 46  0.431285 0.638928 -6.995820 0.102083 2.365800 0.102706
## 47  0.467489 0.631653 -7.156076 0.127642 2.392122 0.097336
## 64  0.256570 0.683296 -7.245620 0.018689 2.498224 0.093534
## 167 0.384868 0.626710 -7.018057 0.176316 1.852402 0.091604
## 168 0.440988 0.628058 -7.517934 0.160414 1.881767 0.075587
## 171 0.522812 0.646818 -7.304500 0.171088 2.095237 0.096220

I check some of the outliers in the MDVP.Flo.Hz. column.
5 of them come from the same individual, and the number of “outliers” is very high. Also, the data is skewed in general, so I don’t remove any of the outliers. Since the majority of measurements are diseased, and the outliers in one of the columns are all control, I think all the “outliers” are actually just control measurements.
I check a few more.

outl=boxplot.stats(pd2[,"MDVP.Fhi.Hz."])$out
ind=which(pd2[,"MDVP.Fhi.Hz."] %in% c(outl))
pd[c(ind),]
##               name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 74  phon_R01_S19_2     112.014      588.518      107.024        0.00533
## 103 phon_R01_S24_6     139.224      586.567       66.157        0.03011
## 116 phon_R01_S27_1     151.872      492.892       69.085        0.00856
## 117 phon_R01_S27_2     158.219      442.557       71.948        0.00476
## 118 phon_R01_S27_3     170.756      450.247       79.032        0.00555
## 119 phon_R01_S27_4     178.285      442.824       82.063        0.00462
## 121 phon_R01_S27_6     128.940      479.697       88.251        0.00581
## 150 phon_R01_S35_4     202.632      565.740      177.258        0.01627
## 187 phon_R01_S49_4     116.556      592.030       86.228        0.00496
## 188 phon_R01_S49_5     116.342      581.289       94.246        0.00267
## 194 phon_R01_S50_5     198.764      396.961       74.904        0.00740
##     MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 74           0.00005  0.00268  0.00329    0.00805      0.02448            0.226
## 103          0.00022  0.01854  0.01628    0.05563      0.09419            0.930
## 116          0.00006  0.00404  0.00385    0.01211      0.01843            0.235
## 117          0.00003  0.00214  0.00207    0.00642      0.01458            0.148
## 118          0.00003  0.00244  0.00261    0.00731      0.01725            0.175
## 119          0.00003  0.00157  0.00194    0.00472      0.01279            0.129
## 121          0.00005  0.00241  0.00314    0.00723      0.02008            0.221
## 150          0.00008  0.00919  0.00963    0.02756      0.07170            0.833
## 187          0.00004  0.00254  0.00263    0.00762      0.01660            0.154
## 188          0.00002  0.00115  0.00148    0.00345      0.01300            0.117
## 194          0.00004  0.00370  0.00390    0.01109      0.02296            0.241
##     Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA     NHR    HNR status
## 74       0.01373      0.01375  0.01956     0.04120 0.00623 24.178      1
## 103      0.05551      0.05005  0.06023     0.16654 0.25930 10.489      1
## 116      0.00796      0.00832  0.01271     0.02389 0.06051 23.693      1
## 117      0.00606      0.00747  0.01312     0.01818 0.01554 26.356      1
## 118      0.00757      0.00971  0.01652     0.02270 0.01802 25.690      1
## 119      0.00617      0.00744  0.01151     0.01851 0.00856 25.020      1
## 121      0.00849      0.01117  0.01734     0.02548 0.02350 24.743      1
## 150      0.03515      0.04265  0.06460     0.10546 0.07889 14.989      1
## 187      0.00820      0.00972  0.01491     0.02460 0.01397 23.958      0
## 188      0.00631      0.00789  0.01144     0.01892 0.00680 25.023      0
## 194      0.01265      0.01321  0.01588     0.03794 0.07223 19.020      0
##         RPDE      DFA   spread1  spread2       D2      PPE
## 74  0.509127 0.789532 -5.389129 0.306636 1.928708 0.225461
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 116 0.407701 0.662668 -4.673241 0.261549 2.702355 0.274407
## 117 0.450798 0.653823 -6.051233 0.273280 2.640798 0.170106
## 118 0.486738 0.676023 -4.597834 0.372114 2.975889 0.282780
## 119 0.470422 0.655239 -4.913137 0.393056 2.816781 0.251972
## 121 0.487756 0.684130 -6.186128 0.279933 2.686240 0.152428
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 187 0.566424 0.667654 -6.431119 0.153310 2.161936 0.120605
## 188 0.528485 0.663884 -6.359018 0.116636 2.152083 0.138868
## 194 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306

In this case many of the outlier measurements come from the same individual.

outl=boxplot.stats(pd2[,"MDVP.Jitter..."])$out
ind=which(pd2[,"MDVP.Jitter..."] %in% c(outl))
pd[c(ind),]
##               name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 99  phon_R01_S24_2     125.791      140.557       96.206        0.01378
## 100 phon_R01_S24_3     126.512      141.756       99.770        0.01936
## 101 phon_R01_S24_4     125.641      141.068      116.346        0.03316
## 102 phon_R01_S24_5     128.451      150.449       75.632        0.01551
## 103 phon_R01_S24_6     139.224      586.567       66.157        0.03011
## 147 phon_R01_S35_1     169.774      191.759      151.451        0.01568
## 148 phon_R01_S35_2     183.520      216.814      161.340        0.01466
## 149 phon_R01_S35_3     188.620      216.302      165.982        0.01719
## 150 phon_R01_S35_4     202.632      565.740      177.258        0.01627
## 151 phon_R01_S35_5     186.695      211.961      149.442        0.01872
## 152 phon_R01_S35_6     192.818      224.429      168.793        0.03107
## 153 phon_R01_S35_7     198.116      233.099      174.478        0.02714
## 158 phon_R01_S37_5     117.963      134.209      100.757        0.01813
## 193 phon_R01_S50_4     174.688      240.005       74.287        0.01360
##     MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 99           0.00011  0.00826  0.00655    0.02478      0.04689            0.422
## 100          0.00015  0.01159  0.00990    0.03476      0.06734            0.659
## 101          0.00026  0.02144  0.01522    0.06433      0.09178            0.891
## 102          0.00012  0.00905  0.00909    0.02716      0.06170            0.584
## 103          0.00022  0.01854  0.01628    0.05563      0.09419            0.930
## 147          0.00009  0.00863  0.00946    0.02589      0.08143            0.821
## 148          0.00008  0.00849  0.00819    0.02546      0.06050            0.618
## 149          0.00009  0.00996  0.01027    0.02987      0.07118            0.722
## 150          0.00008  0.00919  0.00963    0.02756      0.07170            0.833
## 151          0.00010  0.01075  0.01154    0.03225      0.05830            0.784
## 152          0.00016  0.01800  0.01958    0.05401      0.11908            1.302
## 153          0.00014  0.01568  0.01699    0.04705      0.08684            1.018
## 158          0.00015  0.01117  0.00718    0.03351      0.04912            0.438
## 193          0.00008  0.00624  0.00564    0.01873      0.02308            0.256
##     Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA     NHR    HNR status
## 99       0.02542      0.02630  0.03908     0.07625 0.10323 15.433      1
## 100      0.03611      0.03963  0.05783     0.10833 0.16744 12.435      1
## 101      0.05358      0.04791  0.06196     0.16074 0.31482  8.867      1
## 102      0.03223      0.03672  0.05174     0.09669 0.11843 15.060      1
## 103      0.05551      0.05005  0.06023     0.16654 0.25930 10.489      1
## 147      0.03804      0.05426  0.08808     0.11411 0.07530 12.359      1
## 148      0.02865      0.04101  0.06359     0.08595 0.06057 14.367      1
## 149      0.03474      0.04580  0.06824     0.10422 0.08069 12.298      1
## 150      0.03515      0.04265  0.06460     0.10546 0.07889 14.989      1
## 151      0.02699      0.03714  0.06259     0.08096 0.10952 12.529      1
## 152      0.05647      0.07940  0.13778     0.16942 0.21713  8.441      1
## 153      0.04284      0.05556  0.08318     0.12851 0.16265  9.449      1
## 158      0.02610      0.02161  0.02916     0.07830 0.10748 19.075      1
## 193      0.01268      0.01365  0.01667     0.03804 0.10715 17.883      0
##         RPDE      DFA   spread1  spread2       D2      PPE
## 99  0.571010 0.690892 -5.159169 0.202146 2.441612 0.260375
## 100 0.638545 0.674953 -3.760348 0.242861 2.634633 0.378483
## 101 0.671299 0.656846 -3.700544 0.260481 2.991063 0.370961
## 102 0.639808 0.643327 -4.202730 0.310163 2.638279 0.356881
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 147 0.561610 0.793509 -3.297668 0.414758 3.413649 0.457533
## 148 0.478024 0.768974 -4.276605 0.355736 3.142364 0.336085
## 149 0.552870 0.764036 -3.377325 0.335357 3.274865 0.418646
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 151 0.507826 0.762726 -4.484303 0.340256 2.958815 0.301487
## 152 0.625866 0.768320 -2.434031 0.450493 3.079221 0.527367
## 153 0.584164 0.754449 -2.839756 0.356224 3.184027 0.454721
## 158 0.630547 0.646786 -3.444478 0.303214 2.964568 0.261305
## 193 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728

In this case, outliers come from the same individual.

outl=boxplot.stats(pd2[,"MDVP.RAP"])$out
ind=which(pd2[,"MDVP.RAP"] %in% c(outl))
pd[c(ind),]
##               name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 98  phon_R01_S24_1     125.036      143.946      116.187        0.01280
## 99  phon_R01_S24_2     125.791      140.557       96.206        0.01378
## 100 phon_R01_S24_3     126.512      141.756       99.770        0.01936
## 101 phon_R01_S24_4     125.641      141.068      116.346        0.03316
## 102 phon_R01_S24_5     128.451      150.449       75.632        0.01551
## 103 phon_R01_S24_6     139.224      586.567       66.157        0.03011
## 147 phon_R01_S35_1     169.774      191.759      151.451        0.01568
## 148 phon_R01_S35_2     183.520      216.814      161.340        0.01466
## 149 phon_R01_S35_3     188.620      216.302      165.982        0.01719
## 150 phon_R01_S35_4     202.632      565.740      177.258        0.01627
## 151 phon_R01_S35_5     186.695      211.961      149.442        0.01872
## 152 phon_R01_S35_6     192.818      224.429      168.793        0.03107
## 153 phon_R01_S35_7     198.116      233.099      174.478        0.02714
## 158 phon_R01_S37_5     117.963      134.209      100.757        0.01813
##     MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 98           0.00010  0.00743  0.00623    0.02228      0.03886            0.342
## 99           0.00011  0.00826  0.00655    0.02478      0.04689            0.422
## 100          0.00015  0.01159  0.00990    0.03476      0.06734            0.659
## 101          0.00026  0.02144  0.01522    0.06433      0.09178            0.891
## 102          0.00012  0.00905  0.00909    0.02716      0.06170            0.584
## 103          0.00022  0.01854  0.01628    0.05563      0.09419            0.930
## 147          0.00009  0.00863  0.00946    0.02589      0.08143            0.821
## 148          0.00008  0.00849  0.00819    0.02546      0.06050            0.618
## 149          0.00009  0.00996  0.01027    0.02987      0.07118            0.722
## 150          0.00008  0.00919  0.00963    0.02756      0.07170            0.833
## 151          0.00010  0.01075  0.01154    0.03225      0.05830            0.784
## 152          0.00016  0.01800  0.01958    0.05401      0.11908            1.302
## 153          0.00014  0.01568  0.01699    0.04705      0.08684            1.018
## 158          0.00015  0.01117  0.00718    0.03351      0.04912            0.438
##     Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA     NHR    HNR status
## 98       0.02135      0.02174  0.03088     0.06406 0.08151 15.338      1
## 99       0.02542      0.02630  0.03908     0.07625 0.10323 15.433      1
## 100      0.03611      0.03963  0.05783     0.10833 0.16744 12.435      1
## 101      0.05358      0.04791  0.06196     0.16074 0.31482  8.867      1
## 102      0.03223      0.03672  0.05174     0.09669 0.11843 15.060      1
## 103      0.05551      0.05005  0.06023     0.16654 0.25930 10.489      1
## 147      0.03804      0.05426  0.08808     0.11411 0.07530 12.359      1
## 148      0.02865      0.04101  0.06359     0.08595 0.06057 14.367      1
## 149      0.03474      0.04580  0.06824     0.10422 0.08069 12.298      1
## 150      0.03515      0.04265  0.06460     0.10546 0.07889 14.989      1
## 151      0.02699      0.03714  0.06259     0.08096 0.10952 12.529      1
## 152      0.05647      0.07940  0.13778     0.16942 0.21713  8.441      1
## 153      0.04284      0.05556  0.08318     0.12851 0.16265  9.449      1
## 158      0.02610      0.02161  0.02916     0.07830 0.10748 19.075      1
##         RPDE      DFA   spread1  spread2       D2      PPE
## 98  0.629574 0.714485 -4.020042 0.265315 2.671825 0.340623
## 99  0.571010 0.690892 -5.159169 0.202146 2.441612 0.260375
## 100 0.638545 0.674953 -3.760348 0.242861 2.634633 0.378483
## 101 0.671299 0.656846 -3.700544 0.260481 2.991063 0.370961
## 102 0.639808 0.643327 -4.202730 0.310163 2.638279 0.356881
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 147 0.561610 0.793509 -3.297668 0.414758 3.413649 0.457533
## 148 0.478024 0.768974 -4.276605 0.355736 3.142364 0.336085
## 149 0.552870 0.764036 -3.377325 0.335357 3.274865 0.418646
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 151 0.507826 0.762726 -4.484303 0.340256 2.958815 0.301487
## 152 0.625866 0.768320 -2.434031 0.450493 3.079221 0.527367
## 153 0.584164 0.754449 -2.839756 0.356224 3.184027 0.454721
## 158 0.630547 0.646786 -3.444478 0.303214 2.964568 0.261305

Here, two individuals are “outliers.”
Dealing with outliers by either removing them or replacing them with the mean/median seems unnecessary - the data is just skewed in nature. Removing outliers would make the analysis biased.
So I decide to keep them. Since the data is not normally distributed (many predictors are skewed)

Unsupervised Learning Methods for Visualization

pca_pd=prcomp(pd2, scale=TRUE)
str(pca_pd)
## List of 5
##  $ sdev    : num [1:22] 3.6 1.577 1.242 1.21 0.987 ...
##  $ rotation: num [1:22, 1:22] 0.05333 -0.00671 0.06382 -0.25451 -0.24168 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:22] 1.54e+02 1.97e+02 1.16e+02 6.22e-03 4.40e-05 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ scale   : Named num [1:22] 4.14e+01 9.15e+01 4.35e+01 4.85e-03 3.48e-05 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ x       : num [1:195, 1:22] -2.09 -4.7 -3.84 -4.12 -5.68 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:195] "phon_R01_S01_1" "phon_R01_S01_2" "phon_R01_S01_3" "phon_R01_S01_4" ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
#library(devtools)
#install_github("vqv/ggbiplot")
#library(ggbiplot)
ggbiplot(pca_pd)

biplot(pca_pd, scale=0)

ggbiplot(pca_pd, labels=rownames(pd2))

disease_stat=ifelse(pd$status=="1", "PD", "HLT")
ggbiplot(pca_pd,ellipse=TRUE, groups=disease_stat)

The PD measurements seem to be clustered to the bottom

head(pca_pd$rotation)
##                           PC1         PC2        PC3        PC4           PC5
## MDVP.Fo.Hz.       0.053331113  0.55340103 -0.1282881  0.1312285  0.1150947947
## MDVP.Fhi.Hz.     -0.006712501  0.34878153 -0.2676430 -0.2409897  0.1877101044
## MDVP.Flo.Hz.      0.063819423  0.39548027  0.2328452  0.2200752  0.2858261098
## MDVP.Jitter...   -0.254513242  0.08178929  0.1506745 -0.2516952  0.0711150522
## MDVP.Jitter.Abs. -0.241680694 -0.07677255  0.1852899 -0.3081169 -0.0008538928
## MDVP.RAP         -0.249823572  0.11603792  0.1692132 -0.2576196  0.0258294215
##                          PC6          PC7          PC8          PC9        PC10
## MDVP.Fo.Hz.       0.14766861 -0.003505647 -0.216748419 -0.064192702  0.64532341
## MDVP.Fhi.Hz.     -0.71903648  0.369740613 -0.054245149  0.153286499 -0.17370588
## MDVP.Flo.Hz.      0.46768146  0.455281251  0.007371196  0.079323633 -0.45949396
## MDVP.Jitter...    0.04166932 -0.021401088  0.032194922  0.012646201  0.11005180
## MDVP.Jitter.Abs.  0.03927228  0.004741437  0.088529516 -0.002100416 -0.13903402
## MDVP.RAP          0.07737102 -0.045632716  0.034029356  0.056594488  0.08351011
##                          PC11        PC12         PC13        PC14       PC15
## MDVP.Fo.Hz.      -0.107077243  0.25618422 -0.136797666  0.04811878 0.06876995
## MDVP.Fhi.Hz.     -0.006748088 -0.00293302  0.003450677 -0.02033489 0.02663274
## MDVP.Flo.Hz.     -0.072647366 -0.05807287  0.025845377 -0.02589691 0.00625575
## MDVP.Jitter...    0.077534938  0.02693779  0.157434525 -0.06472545 0.05479457
## MDVP.Jitter.Abs. -0.257337393  0.15269365  0.221431558  0.31804839 0.27478004
## MDVP.RAP          0.035878080  0.11951169  0.125351563 -0.23329677 0.05777257
##                          PC16        PC17         PC18          PC19
## MDVP.Fo.Hz.      -0.220928548  0.00253487 -0.012170838 -5.289131e-02
## MDVP.Fhi.Hz.     -0.001394872  0.04206616  0.007020829 -5.063792e-05
## MDVP.Flo.Hz.      0.026583245 -0.01825606 -0.010846368  2.240354e-02
## MDVP.Jitter...    0.001179548 -0.16366562 -0.445880827  7.448806e-01
## MDVP.Jitter.Abs. -0.633146349  0.10389484 -0.034033642 -2.186430e-01
## MDVP.RAP          0.245194725  0.19327305  0.345752801 -5.354520e-02
##                           PC20          PC21          PC22
## MDVP.Fo.Hz.       0.0007821080 -1.748182e-04  1.700574e-05
## MDVP.Fhi.Hz.     -0.0042671358 -1.990487e-05 -1.454460e-05
## MDVP.Flo.Hz.     -0.0005192626  6.257515e-05 -2.930880e-05
## MDVP.Jitter...   -0.0525475476  2.417675e-04 -5.619788e-05
## MDVP.Jitter.Abs. -0.0427786398 -1.893345e-04 -1.238837e-04
## MDVP.RAP         -0.0146062154  7.067090e-01 -2.157490e-02

Proprotion of Variance Explained by PC1

pr.var=pca_pd$sdev^2
pve=pr.var/sum(pr.var)
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type="b")
plot(cumsum(pve), xlab="Principal Component ", ylab=" Cumulative Proportion of Variance Explained ", ylim=c(0,1), type="b")

PC1 explains 58.9% of the variance.

Clustering

Clustering with K=2

km.pd2=kmeans(pd2,2,nstart=20)
head(km.pd2$cluster)
## phon_R01_S01_1 phon_R01_S01_2 phon_R01_S01_3 phon_R01_S01_4 phon_R01_S01_5 
##              2              2              2              2              2 
## phon_R01_S01_6 
##              2
head(pd$status)
## [1] 1 1 1 1 1 1
## Levels: 0 1
s=ifelse(pd$status==1, 1, 2) #so the colors match
head(s) 
## [1] 1 1 1 1 1 1
par(mfrow=c(1,2))
for (i in colnames(pd2))
{plot(pd2[,i], col=(km.pd2$cluster), main="K-Means Clustering; K=2", xlab="", ylab=i, pch=20, cex=2)
plot(pd2[,i], col=(s), main="By Status; black=PD", xlab="", ylab=i, pch=20, cex=2)}

Normalizing the Data

pd2=cbind(pd2, pd$status)
colnames(pd2)=c("MDVP.Fo.Hz.",      "MDVP.Fhi.Hz." ,    "MDVP.Flo.Hz."  ,   "MDVP.Jitter..." ,  "MDVP.Jitter.Abs." ,"MDVP.RAP"     ,   "MDVP.PPQ"    ,     "Jitter.DDP"  ,     "MDVP.Shimmer"   ,  "MDVP.Shimmer.dB." ,"Shimmer.APQ3"   ,  "Shimmer.APQ5" ,   
 "MDVP.APQ"  ,       "Shimmer.DDA"  ,    "NHR" ,             "HNR"    ,          "RPDE"    ,         "DFA"     ,        
"spread1"     ,     "spread2"      ,    "D2"        ,       "PPE"      ,        "status")

The data is skewed, so in order to normalize it, I log transform all the values. The values for spread1 are negative, so I add a constant to all the values.

offset=min(pd2[,"spread1"])
offset=-offset
offset
## [1] 7.964984
pd2norm=log(offset+1+pd2[1:22])
pd2norm=cbind(pd2norm, pd2[,"status"])
head(pd2norm)
##                MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S01_1    4.859479     5.113595     4.430364       2.194200
## phon_R01_S01_2    4.877980     5.060155     4.810427       2.194405
## phon_R01_S01_3    4.833476     4.942185     4.791816       2.194497
## phon_R01_S01_4    4.833429     4.989316     4.790246       2.194438
## phon_R01_S01_5    4.828146     5.015596     4.784320       2.194758
## phon_R01_S01_6    4.863812     4.942549     4.810166       2.194405
##                MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## phon_R01_S01_1         2.193334 2.193739 2.193944   2.194563     2.198193
## phon_R01_S01_2         2.193335 2.193845 2.194102   2.194880     2.200145
## phon_R01_S01_3         2.193336 2.193933 2.194197   2.195146     2.199147
## phon_R01_S01_4         2.193336 2.193886 2.194105   2.195004     2.199434
## phon_R01_S01_5         2.193339 2.194057 2.194339   2.195517     2.200468
## phon_R01_S01_6         2.193335 2.193843 2.194163   2.194873     2.198556
##                MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## phon_R01_S01_1         2.239750     2.195757     2.196812 2.196635    2.200600
## phon_R01_S01_2         2.260823     2.196816     2.198353 2.198187    2.203760
## phon_R01_S01_3         2.245696     2.196397     2.197620 2.197323    2.202509
## phon_R01_S01_4         2.249394     2.196583     2.197784 2.197525    2.203062
## phon_R01_S01_5         2.256435     2.197212     2.198694 2.198294    2.204937
## phon_R01_S01_6         2.242940     2.195920     2.197252 2.196937    2.201088
##                     NHR      HNR     RPDE      DFA  spread1  spread2       D2
## phon_R01_S01_1 2.195790 3.401130 2.238555 2.280367 1.423579 2.222618 2.421827
## phon_R01_S01_2 2.195476 3.333988 2.243190 2.280800 1.587150 2.230076 2.438150
## phon_R01_S01_3 2.194785 3.388314 2.240165 2.281389 1.508911 2.227447 2.425443
## phon_R01_S01_4 2.194834 3.388078 2.240705 2.280771 1.578460 2.229921 2.431026
## phon_R01_S01_5 2.195295 3.353896 2.238829 2.281205 1.651960 2.219149 2.424552
## phon_R01_S01_6 2.194688 3.412565 2.238638 2.281367 1.552257 2.226146 2.411668
##                     PPE pd2[, "status"]
## phon_R01_S01_1 2.224584               1
## phon_R01_S01_2 2.233627               1
## phon_R01_S01_3 2.229758               1
## phon_R01_S01_4 2.233659               1
## phon_R01_S01_5 2.238081               1
## phon_R01_S01_6 2.232459               1
colnames(pd2norm)=c("MDVP.Fo.Hz.",      "MDVP.Fhi.Hz." ,    "MDVP.Flo.Hz."  ,   "MDVP.Jitter..." ,  "MDVP.Jitter.Abs." ,"MDVP.RAP"     ,   "MDVP.PPQ"    ,     "Jitter.DDP"  ,     "MDVP.Shimmer"   ,  "MDVP.Shimmer.dB." ,"Shimmer.APQ3"   ,  "Shimmer.APQ5" ,   
 "MDVP.APQ"  ,       "Shimmer.DDA"  ,    "NHR" ,             "HNR"    ,          "RPDE"    ,         "DFA"     ,        
"spread1"     ,     "spread2"      ,    "D2"        ,       "PPE"      ,        "status")

Plotting Again

par(mfrow=c(3,3))
for (i in colnames(pd2norm[1:22]))
  {hist(pd2norm[,i], main=i)}

par(mfrow=c(1,3))
for (i in colnames(pd2norm[1:22]))
  {boxplot(pd2norm[,i], main=i)}

PCA Again

pca_pd=prcomp(pd2norm[1:22], scale=TRUE)
str(pca_pd)
## List of 5
##  $ sdev    : num [1:22] 3.592 1.631 1.262 1.202 0.984 ...
##  $ rotation: num [1:22, 1:22] 0.048 0.0032 0.0612 -0.2549 -0.242 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:22] 5.06 5.26 4.78 2.19 2.19 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ scale   : Named num [1:22] 2.47e-01 3.48e-01 3.12e-01 5.40e-04 3.88e-06 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ x       : num [1:195, 1:22] -2.09 -4.63 -3.81 -4.06 -5.58 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:195] "phon_R01_S01_1" "phon_R01_S01_2" "phon_R01_S01_3" "phon_R01_S01_4" ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
#library(devtools)
#install_github("vqv/ggbiplot")
#library(ggbiplot)
ggbiplot(pca_pd)

biplot(pca_pd, scale=0)

ggbiplot(pca_pd, labels=rownames(pd2))

disease_stat=ifelse(pd$status=="1", "PD", "HLT")
ggbiplot(pca_pd,ellipse=TRUE, groups=disease_stat)

The PD measurements seem to be clustered to the bottom

#pca_pd$rotation

I don’t end up using the normalized data because it has weird output.

Training and Test Sets

Since there are only 8 healthy individuals (around 84 measurements for healthy vs 147 for diseased), I make sure to include a sufficient number in the training and test sets, so I split them into healthy vs diseased, sample from each, and then combine them into training and test sets.

pd_dis_idx=which(pd2$status==1) #indeces of PD individuals
pd_health_idx=which(pd2$status==0) #indeces of healthy individuals
#pd_health_idx
#pd_dis_idx
set.seed(4706)

#nrow(pd_dis)
#nrow(pd_health)

#73 diseased test
#24 healthy test
#74 diseases train
#24 healthy train

test_health=sample(pd_health_idx, 24, replace = FALSE) #sample from indeces
#test_health

test_dis=sample(pd_dis_idx, 73, replace = FALSE) #sample from indeces
#test_dis

test=append(test_dis, test_health) #test set

tot=1:nrow(pd2)

train=tot[-test] #training set

y.train=pd2[c(train), "status"]
y.test=pd2[c(test), "status"]
y.train
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0
## Levels: 0 1
y.test
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
## [77] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1

Best Subset Selection

regfit_full=regsubsets(status~., data=pd2, subset=train, nvmax=23) #nvmax=p
reg_full_sum=summary(regfit_full) #returns R^2, adjusted R^2, AIC, BIC, Cp
reg_full_sum
## Subset selection object
## Call: regsubsets.formula(status ~ ., data = pd2, subset = train, nvmax = 23)
## 22 Variables  (and intercept)
##                  Forced in Forced out
## MDVP.Fo.Hz.          FALSE      FALSE
## MDVP.Fhi.Hz.         FALSE      FALSE
## MDVP.Flo.Hz.         FALSE      FALSE
## MDVP.Jitter...       FALSE      FALSE
## MDVP.Jitter.Abs.     FALSE      FALSE
## MDVP.RAP             FALSE      FALSE
## MDVP.PPQ             FALSE      FALSE
## Jitter.DDP           FALSE      FALSE
## MDVP.Shimmer         FALSE      FALSE
## MDVP.Shimmer.dB.     FALSE      FALSE
## Shimmer.APQ3         FALSE      FALSE
## Shimmer.APQ5         FALSE      FALSE
## MDVP.APQ             FALSE      FALSE
## Shimmer.DDA          FALSE      FALSE
## NHR                  FALSE      FALSE
## HNR                  FALSE      FALSE
## RPDE                 FALSE      FALSE
## DFA                  FALSE      FALSE
## spread1              FALSE      FALSE
## spread2              FALSE      FALSE
## D2                   FALSE      FALSE
## PPE                  FALSE      FALSE
## 1 subsets of each size up to 22
## Selection Algorithm: exhaustive
##           MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 1  ( 1 )  " "         " "          " "          " "            " "             
## 2  ( 1 )  "*"         " "          " "          " "            " "             
## 3  ( 1 )  " "         " "          " "          " "            " "             
## 4  ( 1 )  " "         " "          " "          " "            " "             
## 5  ( 1 )  " "         " "          " "          " "            " "             
## 6  ( 1 )  " "         " "          " "          " "            " "             
## 7  ( 1 )  " "         " "          "*"          " "            " "             
## 8  ( 1 )  " "         " "          "*"          " "            " "             
## 9  ( 1 )  "*"         " "          " "          " "            "*"             
## 10  ( 1 ) "*"         " "          "*"          " "            "*"             
## 11  ( 1 ) "*"         " "          "*"          " "            "*"             
## 12  ( 1 ) "*"         " "          "*"          " "            "*"             
## 13  ( 1 ) "*"         " "          "*"          " "            "*"             
## 14  ( 1 ) "*"         " "          "*"          " "            "*"             
## 15  ( 1 ) "*"         " "          "*"          " "            "*"             
## 16  ( 1 ) "*"         " "          "*"          " "            "*"             
## 17  ( 1 ) "*"         " "          "*"          " "            "*"             
## 18  ( 1 ) "*"         " "          "*"          "*"            "*"             
## 19  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 20  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 21  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 22  ( 1 ) "*"         "*"          "*"          "*"            "*"             
##           MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1  ( 1 )  " "      " "      " "        " "          " "             
## 2  ( 1 )  " "      " "      " "        " "          " "             
## 3  ( 1 )  " "      " "      " "        " "          " "             
## 4  ( 1 )  "*"      "*"      " "        " "          " "             
## 5  ( 1 )  " "      "*"      "*"        " "          "*"             
## 6  ( 1 )  " "      "*"      "*"        " "          "*"             
## 7  ( 1 )  " "      "*"      "*"        " "          "*"             
## 8  ( 1 )  "*"      "*"      " "        " "          "*"             
## 9  ( 1 )  "*"      "*"      " "        " "          "*"             
## 10  ( 1 ) "*"      "*"      " "        " "          "*"             
## 11  ( 1 ) "*"      "*"      " "        " "          "*"             
## 12  ( 1 ) "*"      "*"      " "        "*"          "*"             
## 13  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 14  ( 1 ) "*"      "*"      " "        "*"          "*"             
## 15  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 16  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 17  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 18  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 19  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 20  ( 1 ) " "      "*"      "*"        "*"          "*"             
## 21  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 22  ( 1 ) "*"      "*"      "*"        "*"          "*"             
##           Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA
## 1  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 2  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 3  ( 1 )  " "          " "          " "      " "         " " " " " "  "*"
## 4  ( 1 )  " "          " "          " "      " "         " " " " " "  "*"
## 5  ( 1 )  " "          " "          " "      " "         " " " " " "  "*"
## 6  ( 1 )  "*"          " "          " "      " "         " " " " " "  "*"
## 7  ( 1 )  "*"          " "          " "      " "         " " " " " "  "*"
## 8  ( 1 )  "*"          " "          " "      " "         " " " " "*"  "*"
## 9  ( 1 )  " "          "*"          " "      " "         " " " " "*"  "*"
## 10  ( 1 ) "*"          " "          " "      " "         " " " " "*"  "*"
## 11  ( 1 ) "*"          " "          "*"      " "         " " " " "*"  "*"
## 12  ( 1 ) "*"          " "          "*"      " "         " " " " "*"  "*"
## 13  ( 1 ) "*"          "*"          "*"      " "         " " " " "*"  "*"
## 14  ( 1 ) "*"          " "          "*"      "*"         " " " " "*"  "*"
## 15  ( 1 ) "*"          "*"          "*"      "*"         " " " " "*"  "*"
## 16  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 17  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 18  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 19  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 20  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 21  ( 1 ) "*"          "*"          "*"      "*"         " " "*" "*"  "*"
## 22  ( 1 ) "*"          "*"          "*"      "*"         "*" "*" "*"  "*"
##           spread1 spread2 D2  PPE
## 1  ( 1 )  "*"     " "     " " " "
## 2  ( 1 )  "*"     " "     " " " "
## 3  ( 1 )  "*"     " "     " " "*"
## 4  ( 1 )  "*"     " "     " " " "
## 5  ( 1 )  "*"     " "     " " " "
## 6  ( 1 )  "*"     " "     " " " "
## 7  ( 1 )  "*"     " "     " " " "
## 8  ( 1 )  "*"     " "     " " " "
## 9  ( 1 )  "*"     " "     " " " "
## 10  ( 1 ) "*"     " "     " " " "
## 11  ( 1 ) "*"     " "     " " " "
## 12  ( 1 ) "*"     " "     " " " "
## 13  ( 1 ) "*"     " "     " " " "
## 14  ( 1 ) "*"     "*"     " " " "
## 15  ( 1 ) "*"     "*"     " " " "
## 16  ( 1 ) "*"     "*"     " " " "
## 17  ( 1 ) "*"     "*"     "*" " "
## 18  ( 1 ) "*"     "*"     "*" " "
## 19  ( 1 ) "*"     "*"     "*" " "
## 20  ( 1 ) "*"     "*"     "*" "*"
## 21  ( 1 ) "*"     "*"     "*" "*"
## 22  ( 1 ) "*"     "*"     "*" "*"
names(reg_full_sum)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
names(regfit_full)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"    "first"    
##  [7] "last"      "vorder"    "tol"       "rss"       "bound"     "nvmax"    
## [13] "ress"      "ir"        "nbest"     "lopt"      "il"        "ier"      
## [19] "xnames"    "method"    "force.in"  "force.out" "sserr"     "intercept"
## [25] "lindep"    "nullrss"   "nn"        "call"

Plotting

par(mfrow=c(2,2))

#RSS
plot(reg_full_sum$rss, xlab="Number of variables", ylab="RSS", type ="l")
mnrss=which.min(reg_full_sum$rss)
#Plotting min RSS
points(mnrss, reg_full_sum$adjr2[mnrss], col="red", cex=2, pch=20)

#Adjusted R^2
plot(reg_full_sum$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type ="l")
mxr2=which.max(reg_full_sum$adjr2)
#Plotting max adjusted R^2
points(mxr2, reg_full_sum$adjr2[mxr2], col="red", cex=2, pch=20)

#CP
plot(reg_full_sum$cp, xlab = "Number of variables", ylab="CP", type = "l")
mncp=which.min(reg_full_sum$cp)
points(mncp, reg_full_sum$cp[mncp], col="blue", cex=2, pch=20)

#BIC
plot(reg_full_sum$bic, xlab="Number of variables", ylab = "BIC", type="l")
mnbic=which.min(reg_full_sum$bic)
points(mnbic, reg_full_sum$bic[mnbic], col="green", pch=20, cex=2)

BIC selects for a smaller model (5 predictors) (heavier penalty)
CP min chooses around 6.
Adjusted R^2 max is around 8.
As expected, RSS decreases as more predictors are added, but this is not indicative of test error rate.

Plotting Selected Variables

plot(regfit_full, scale="r2")

plot(regfit_full, scale="adjr2")

plot(regfit_full, scale="Cp")

plot(regfit_full, scale="bic")

Coef of models

"model with highest adjusted R^2"
## [1] "model with highest adjusted R^2"
coef(regfit_full, mxr2) #model with highest adjusted R^2
##      (Intercept)     MDVP.Flo.Hz.         MDVP.RAP         MDVP.PPQ 
##     2.746547e+00    -1.486939e-03     1.527527e+02    -2.698480e+02 
## MDVP.Shimmer.dB.     Shimmer.APQ3             RPDE              DFA 
##     2.178926e+00    -2.396256e+01    -6.369941e-01     2.261761e+00 
##          spread1 
##     3.434226e-01
"model with lowest CP"
## [1] "model with lowest CP"
coef(regfit_full, mncp) #model with lowest CP
##      (Intercept)         MDVP.PPQ       Jitter.DDP MDVP.Shimmer.dB. 
##        1.9776112     -265.6049616       51.6839298        1.8572010 
##     Shimmer.APQ3              DFA          spread1 
##      -20.8234354        2.5707226        0.3296206
"model with lowest BIC"
## [1] "model with lowest BIC"
coef(regfit_full, mnbic) #model with lowest BIC
##      (Intercept)         MDVP.PPQ       Jitter.DDP MDVP.Shimmer.dB. 
##        2.0008538     -209.8104429       38.6909884        0.7094366 
##              DFA          spread1 
##        2.3236992        0.3135981

Ridge Regression & Lasso

x=model.matrix(status~., pd2)[,-1] #[,-1] to remove the intercept
y=pd2$status
#automatically transforms any qualitative variables into dummy variables

Ridge Regression

#standardizes by default
#alpha=0 ridge regression,  alpha=1  lasso
grid=10^seq(10, -2, length=100) #10 to the power of a sequence -> our lambda values
grid
##   [1] 1.000000e+10 7.564633e+09 5.722368e+09 4.328761e+09 3.274549e+09
##   [6] 2.477076e+09 1.873817e+09 1.417474e+09 1.072267e+09 8.111308e+08
##  [11] 6.135907e+08 4.641589e+08 3.511192e+08 2.656088e+08 2.009233e+08
##  [16] 1.519911e+08 1.149757e+08 8.697490e+07 6.579332e+07 4.977024e+07
##  [21] 3.764936e+07 2.848036e+07 2.154435e+07 1.629751e+07 1.232847e+07
##  [26] 9.326033e+06 7.054802e+06 5.336699e+06 4.037017e+06 3.053856e+06
##  [31] 2.310130e+06 1.747528e+06 1.321941e+06 1.000000e+06 7.564633e+05
##  [36] 5.722368e+05 4.328761e+05 3.274549e+05 2.477076e+05 1.873817e+05
##  [41] 1.417474e+05 1.072267e+05 8.111308e+04 6.135907e+04 4.641589e+04
##  [46] 3.511192e+04 2.656088e+04 2.009233e+04 1.519911e+04 1.149757e+04
##  [51] 8.697490e+03 6.579332e+03 4.977024e+03 3.764936e+03 2.848036e+03
##  [56] 2.154435e+03 1.629751e+03 1.232847e+03 9.326033e+02 7.054802e+02
##  [61] 5.336699e+02 4.037017e+02 3.053856e+02 2.310130e+02 1.747528e+02
##  [66] 1.321941e+02 1.000000e+02 7.564633e+01 5.722368e+01 4.328761e+01
##  [71] 3.274549e+01 2.477076e+01 1.873817e+01 1.417474e+01 1.072267e+01
##  [76] 8.111308e+00 6.135907e+00 4.641589e+00 3.511192e+00 2.656088e+00
##  [81] 2.009233e+00 1.519911e+00 1.149757e+00 8.697490e-01 6.579332e-01
##  [86] 4.977024e-01 3.764936e-01 2.848036e-01 2.154435e-01 1.629751e-01
##  [91] 1.232847e-01 9.326033e-02 7.054802e-02 5.336699e-02 4.037017e-02
##  [96] 3.053856e-02 2.310130e-02 1.747528e-02 1.321941e-02 1.000000e-02
ridge.mod=glmnet(x, y, alpha=0, lambda=grid, family = "binomial")
#rows (predictors), columns (lambda values)
ridge.mod$lambda[50] #lambda value
## [1] 11497.57
coef(ridge.mod)[,50] #coef for this lambda value (index)
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     1.118985e+00    -3.480464e-07    -6.820632e-08    -3.281221e-07 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##     2.155138e-03     3.652407e-01     3.374402e-03     3.929665e-03 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     1.124699e-03     7.317776e-04     6.758354e-05     1.285790e-03 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     1.096786e-03     8.073509e-04     4.285862e-04     1.759856e-04 
##              HNR             RPDE              DFA          spread1 
##    -3.067666e-06     1.114926e-04     1.572944e-04     1.945867e-05 
##          spread2               D2              PPE 
##     2.048184e-04     3.338087e-05     2.213090e-04

Estimating Test Error

ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12, family = "binomial")
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
#check vs least squares lambda=0
ridge.pred=predict(ridge.mod, s=0, newx=x[test,], exact = T, x=x[train,], y=y[train]) 
## Warning: from glmnet Fortran code (error code -101); Convergence for 101th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
lm(y~x, subset = train)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## 
## Call:
## lm(formula = y ~ x, subset = train)
## 
## Coefficients:
##       (Intercept)       xMDVP.Fo.Hz.      xMDVP.Fhi.Hz.      xMDVP.Flo.Hz.  
##         2.543e+00         -2.456e-03          1.312e-04         -1.156e-03  
##   xMDVP.Jitter...  xMDVP.Jitter.Abs.          xMDVP.RAP          xMDVP.PPQ  
##        -2.589e+01         -8.314e+03         -4.337e+02         -1.994e+02  
##       xJitter.DDP      xMDVP.Shimmer  xMDVP.Shimmer.dB.      xShimmer.APQ3  
##         2.222e+02          4.607e+01          1.584e+00         -9.141e+03  
##     xShimmer.APQ5          xMDVP.APQ       xShimmer.DDA               xNHR  
##        -1.380e+01         -1.707e+01          3.027e+03         -5.984e-02  
##              xHNR              xRPDE               xDFA           xspread1  
##         9.963e-03         -4.796e-01          1.921e+00          3.055e-01  
##          xspread2                xD2               xPPE  
##         3.100e-01          5.409e-02          3.579e-01
predict(ridge.mod,s=0, exact = T, type="coefficients", x=x[train,], y=y[train])[1:20,]
## Warning: from glmnet Fortran code (error code -101); Convergence for 101th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##    -4.530045e+00    -8.450098e-03    -1.067700e-03    -3.164070e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##    -4.930979e+01    -1.025792e+04     3.104202e+01    -9.981425e+01 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     1.041818e+01     2.190459e+01     2.310188e+00     1.219492e+01 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     2.613212e+01     3.634479e+01     4.069749e+00     1.922760e+00 
##              HNR             RPDE              DFA          spread1 
##     8.991673e-02    -3.457758e+00     9.767619e+00     9.881682e-01

Cross Validation for Ridge

set.seed(4706)
cv.out=cv.glmnet(x[train,], y[train], alpha=0, family = "binomial")
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 0.03760963
#predict on test using best lambda
ridge.pred=predict(ridge.mod, s=bestlam, newx=x[test,])
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
#refit on full data
out=glmnet(x,y,alpha=0, family = "binomial")
predict(out,type="coefficients",s=bestlam)[1:20,]
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##    -1.053252e+00    -6.012875e-03    -2.183152e-03    -2.945454e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##    -2.128719e+01    -1.543317e+03     3.025824e+01    -5.665370e+00 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     1.008551e+01     8.520327e+00     6.977416e-01     6.524013e+00 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     1.387721e+01     1.560451e+01     2.191096e+00    -3.414915e+00 
##              HNR             RPDE              DFA          spread1 
##     5.994621e-03    -1.577644e+00     3.156172e+00     5.734877e-01

Lasso

#Fitting on training data
lasso.mod=glmnet(x[train,], y[train], alpha=1, lambda=grid, family="binomial")
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

#Getting best lambda 
set.seed(4706)
cv.out=cv.glmnet(x[train,], y[train], alpha=1, family="binomial")
plot(cv.out)

bestlam=cv.out$lambda.min #lambda.min=lambda with lowest error
bestlam
## [1] 0.02307692
lasso.pred=predict(lasso.mod, s=bestlam, newx=x[test,])
mean((lasso.pred!=y.test))
## [1] 1
#Refitting on full data using best lambda
out=glmnet(x, y, alpha=1, lambda=grid, family="binomial")
lasso.coef=predict(out, type="coefficients", s=bestlam)
lasso.coef
## 23 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)       5.047928708
## MDVP.Fo.Hz.      -0.001916708
## MDVP.Fhi.Hz.     -0.001457227
## MDVP.Flo.Hz.      .          
## MDVP.Jitter...    .          
## MDVP.Jitter.Abs.  .          
## MDVP.RAP          .          
## MDVP.PPQ          .          
## Jitter.DDP        .          
## MDVP.Shimmer      .          
## MDVP.Shimmer.dB.  .          
## Shimmer.APQ3      .          
## Shimmer.APQ5      .          
## MDVP.APQ         13.251313646
## Shimmer.DDA       .          
## NHR               .          
## HNR               .          
## RPDE              .          
## DFA               2.568419871
## spread1           1.364326482
## spread2           3.328087998
## D2                0.961838161
## PPE               .

Predictors chosen by lasso: MDVP.Fo.Hz.+MDVP.Fhi.Hz+MDVP.APQ+DFA+spread1+spread2+D2

Logistic Regression

I fit a logistic regression model on the training data

logist_reg_fit=glm(status~., data=pd2, family=binomial, subset=train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logist_reg_fit)
## 
## Call:
## glm(formula = status ~ ., family = binomial, data = pd2, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.25209   0.00000   0.00454   0.14252   1.50515  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      -1.857e+01  4.922e+01  -0.377   0.7059  
## MDVP.Fo.Hz.       3.087e-03  5.866e-02   0.053   0.9580  
## MDVP.Fhi.Hz.      3.238e-03  1.513e-02   0.214   0.8305  
## MDVP.Flo.Hz.     -1.936e-02  3.836e-02  -0.505   0.6138  
## MDVP.Jitter...   -1.497e+03  2.542e+03  -0.589   0.5561  
## MDVP.Jitter.Abs. -2.389e+05  2.527e+05  -0.945   0.3446  
## MDVP.RAP         -6.776e+04  2.884e+05  -0.235   0.8143  
## MDVP.PPQ         -1.864e+03  3.768e+03  -0.495   0.6207  
## Jitter.DDP        2.365e+04  9.598e+04   0.246   0.8053  
## MDVP.Shimmer      5.099e+03  4.432e+03   1.151   0.2499  
## MDVP.Shimmer.dB.  8.412e+01  1.231e+02   0.683   0.4944  
## Shimmer.APQ3      1.352e+05  3.108e+05   0.435   0.6636  
## Shimmer.APQ5     -2.056e+03  1.710e+03  -1.202   0.2292  
## MDVP.APQ         -1.068e+03  1.249e+03  -0.855   0.3925  
## Shimmer.DDA      -4.701e+04  1.037e+05  -0.453   0.6502  
## NHR               8.646e+01  1.023e+02   0.845   0.3982  
## HNR               4.297e-01  5.681e-01   0.756   0.4494  
## RPDE              4.853e-01  9.039e+00   0.054   0.9572  
## DFA               5.860e+01  2.999e+01   1.954   0.0507 .
## spread1           6.140e+00  6.336e+00   0.969   0.3326  
## spread2          -3.070e+00  1.360e+01  -0.226   0.8214  
## D2                9.834e-01  2.602e+00   0.378   0.7055  
## PPE               5.662e+00  6.918e+01   0.082   0.9348  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 109.105  on 97  degrees of freedom
## Residual deviance:  32.917  on 75  degrees of freedom
## AIC: 78.917
## 
## Number of Fisher Scoring iterations: 11
#"Coefficients"
#coef(logist_reg_fit)

#"P-Values"
#summary(logist_reg_fit)$coef[ ,4]
step(logist_reg_fit, direction = "forward")
## Start:  AIC=78.92
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + 
##     MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + 
##     MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 + 
##     D2 + PPE
## 
## Call:  glm(formula = status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + 
##     MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + 
##     Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + 
##     Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + RPDE + 
##     DFA + spread1 + spread2 + D2 + PPE, family = binomial, data = pd2, 
##     subset = train)
## 
## Coefficients:
##      (Intercept)       MDVP.Fo.Hz.      MDVP.Fhi.Hz.      MDVP.Flo.Hz.  
##       -1.857e+01         3.087e-03         3.238e-03        -1.936e-02  
##   MDVP.Jitter...  MDVP.Jitter.Abs.          MDVP.RAP          MDVP.PPQ  
##       -1.497e+03        -2.389e+05        -6.776e+04        -1.864e+03  
##       Jitter.DDP      MDVP.Shimmer  MDVP.Shimmer.dB.      Shimmer.APQ3  
##        2.365e+04         5.099e+03         8.412e+01         1.352e+05  
##     Shimmer.APQ5          MDVP.APQ       Shimmer.DDA               NHR  
##       -2.056e+03        -1.068e+03        -4.701e+04         8.646e+01  
##              HNR              RPDE               DFA           spread1  
##        4.297e-01         4.853e-01         5.860e+01         6.140e+00  
##          spread2                D2               PPE  
##       -3.070e+00         9.834e-01         5.662e+00  
## 
## Degrees of Freedom: 97 Total (i.e. Null);  75 Residual
## Null Deviance:       109.1 
## Residual Deviance: 32.92     AIC: 78.92

MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.Flo.Hz.+MDVP.Jitter…+MDVP.Jitter.Abs.+MDVP.RAP

step(logist_reg_fit, direction = "backward")
## Start:  AIC=78.92
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + 
##     MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + 
##     MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 + 
##     D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - MDVP.Fo.Hz.       1   32.920 76.920
## - RPDE              1   32.920 76.920
## - PPE               1   32.924 76.924
## - spread2           1   32.969 76.969
## - MDVP.RAP          1   32.971 76.971
## - MDVP.Fhi.Hz.      1   32.973 76.973
## - Jitter.DDP        1   32.977 76.977
## - D2                1   33.062 77.062
## - Shimmer.APQ3      1   33.127 77.127
## - Shimmer.DDA       1   33.147 77.147
## - MDVP.PPQ          1   33.172 77.172
## - MDVP.Flo.Hz.      1   33.192 77.192
## - MDVP.Jitter...    1   33.281 77.281
## - MDVP.Shimmer.dB.  1   33.453 77.453
## - HNR               1   33.568 77.568
## - MDVP.APQ          1   33.709 77.709
## - NHR               1   33.838 77.838
## - MDVP.Jitter.Abs.  1   33.954 77.954
## - MDVP.Shimmer      1   34.551 78.551
## - spread1           1   34.758 78.758
## - Shimmer.APQ5      1   34.780 78.780
## <none>                  32.917 78.917
## - DFA               1   38.827 82.827
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=76.92
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + 
##     MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + 
##     Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + 
##     HNR + RPDE + DFA + spread1 + spread2 + D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - RPDE              1   32.925 74.925
## - PPE               1   32.930 74.930
## - MDVP.RAP          1   32.972 74.972
## - spread2           1   32.976 74.976
## - Jitter.DDP        1   32.977 74.977
## - MDVP.Fhi.Hz.      1   32.987 74.987
## - D2                1   33.086 75.086
## - Shimmer.APQ3      1   33.131 75.131
## - Shimmer.DDA       1   33.150 75.150
## - MDVP.PPQ          1   33.173 75.173
## - MDVP.Flo.Hz.      1   33.240 75.240
## - MDVP.Jitter...    1   33.283 75.283
## - MDVP.Shimmer.dB.  1   33.560 75.560
## - HNR               1   33.584 75.584
## - MDVP.APQ          1   33.725 75.725
## - NHR               1   33.855 75.855
## - MDVP.Shimmer      1   34.775 76.775
## - spread1           1   34.819 76.819
## <none>                  32.920 76.920
## - MDVP.Jitter.Abs.  1   35.153 77.153
## - Shimmer.APQ5      1   35.156 77.156
## - DFA               1   43.848 85.848
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=74.92
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + 
##     MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + 
##     Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + 
##     HNR + DFA + spread1 + spread2 + D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - PPE               1   32.936 72.936
## - MDVP.RAP          1   32.972 72.972
## - Jitter.DDP        1   32.978 72.978
## - spread2           1   32.991 72.991
## - MDVP.Fhi.Hz.      1   33.004 73.004
## - D2                1   33.111 73.111
## - Shimmer.APQ3      1   33.131 73.131
## - Shimmer.DDA       1   33.150 73.150
## - MDVP.PPQ          1   33.219 73.219
## - MDVP.Flo.Hz.      1   33.241 73.241
## - MDVP.Jitter...    1   33.331 73.331
## - MDVP.Shimmer.dB.  1   33.593 73.593
## - HNR               1   33.645 73.645
## - MDVP.APQ          1   33.742 73.742
## - NHR               1   33.880 73.880
## - MDVP.Shimmer      1   34.851 74.851
## - spread1           1   34.865 74.865
## <none>                  32.925 74.925
## - Shimmer.APQ5      1   35.193 75.193
## - MDVP.Jitter.Abs.  1   35.840 75.840
## - DFA               1   44.698 84.698
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=72.94
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + 
##     MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + 
##     Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + 
##     HNR + DFA + spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - MDVP.RAP          1   32.979 70.979
## - Jitter.DDP        1   32.985 70.985
## - MDVP.Fhi.Hz.      1   33.007 71.007
## - spread2           1   33.014 71.014
## - D2                1   33.128 71.128
## - Shimmer.APQ3      1   33.159 71.159
## - Shimmer.DDA       1   33.180 71.180
## - MDVP.PPQ          1   33.226 71.226
## - MDVP.Flo.Hz.      1   33.242 71.242
## - MDVP.Jitter...    1   33.381 71.381
## - HNR               1   33.646 71.646
## - MDVP.Shimmer.dB.  1   33.707 71.707
## - MDVP.APQ          1   33.855 71.855
## - NHR               1   33.884 71.884
## <none>                  32.936 72.936
## - MDVP.Shimmer      1   35.000 73.000
## - Shimmer.APQ5      1   35.433 73.433
## - MDVP.Jitter.Abs.  1   35.956 73.956
## - DFA               1   44.698 82.698
## - spread1           1   45.859 83.859
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=70.98
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + 
##     MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + 
##     Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + 
##     HNR + DFA + spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - MDVP.Fhi.Hz.      1   33.041 69.041
## - spread2           1   33.076 69.076
## - D2                1   33.243 69.243
## - MDVP.Flo.Hz.      1   33.298 69.298
## - MDVP.PPQ          1   33.341 69.341
## - Shimmer.APQ3      1   33.347 69.347
## - Shimmer.DDA       1   33.377 69.377
## - MDVP.Jitter...    1   33.456 69.456
## - HNR               1   33.654 69.654
## - NHR               1   33.885 69.885
## - MDVP.APQ          1   33.993 69.993
## - MDVP.Shimmer.dB.  1   34.275 70.275
## <none>                  32.979 70.979
## - MDVP.Shimmer      1   35.070 71.070
## - Jitter.DDP        1   35.396 71.396
## - Shimmer.APQ5      1   35.534 71.534
## - MDVP.Jitter.Abs.  1   36.284 72.284
## - DFA               1   45.193 81.193
## - spread1           1   46.724 82.724
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=69.04
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + 
##     Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + 
##     Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA + 
##     spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - spread2           1   33.102 67.102
## - D2                1   33.334 67.334
## - MDVP.PPQ          1   33.349 67.349
## - MDVP.Flo.Hz.      1   33.365 67.365
## - Shimmer.APQ3      1   33.404 67.404
## - Shimmer.DDA       1   33.432 67.432
## - MDVP.Jitter...    1   33.545 67.545
## - HNR               1   33.832 67.832
## - MDVP.APQ          1   34.012 68.012
## - NHR               1   34.076 68.076
## - MDVP.Shimmer.dB.  1   34.828 68.828
## <none>                  33.041 69.041
## - MDVP.Shimmer      1   35.109 69.109
## - Jitter.DDP        1   35.402 69.402
## - Shimmer.APQ5      1   35.606 69.606
## - MDVP.Jitter.Abs.  1   37.812 71.812
## - DFA               1   46.159 80.159
## - spread1           1   46.770 80.770
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=67.1
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + 
##     Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + 
##     Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA + 
##     spread1 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - D2                1   33.355 65.355
## - MDVP.PPQ          1   33.404 65.404
## - MDVP.Flo.Hz.      1   33.412 65.412
## - Shimmer.APQ3      1   33.461 65.461
## - Shimmer.DDA       1   33.487 65.487
## - MDVP.Jitter...    1   33.620 65.620
## - HNR               1   33.833 65.833
## - MDVP.APQ          1   34.015 66.015
## - NHR               1   34.078 66.078
## - MDVP.Shimmer.dB.  1   35.038 67.038
## <none>                  33.102 67.102
## - MDVP.Shimmer      1   35.112 67.112
## - Shimmer.APQ5      1   35.634 67.634
## - Jitter.DDP        1   35.977 67.977
## - MDVP.Jitter.Abs.  1   37.880 69.880
## - DFA               1   46.248 78.248
## - spread1           1   52.354 84.354
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=65.36
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + 
##     Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + 
##     Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA + 
##     spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - MDVP.Flo.Hz.      1   33.547 63.547
## - Shimmer.APQ3      1   33.665 63.665
## - Shimmer.DDA       1   33.691 63.691
## - MDVP.Jitter...    1   33.758 63.758
## - HNR               1   33.870 63.870
## - MDVP.PPQ          1   33.938 63.938
## - NHR               1   34.278 64.278
## - MDVP.APQ          1   34.794 64.794
## - MDVP.Shimmer.dB.  1   35.162 65.162
## <none>                  33.355 65.355
## - Shimmer.APQ5      1   35.968 65.968
## - MDVP.Shimmer      1   36.001 66.001
## - Jitter.DDP        1   36.556 66.556
## - MDVP.Jitter.Abs.  1   38.321 68.321
## - DFA               1   46.626 76.626
## - spread1           1   57.434 87.434
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=63.55
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + 
##     MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + 
##     MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - Shimmer.APQ3      1   33.808 61.808
## - Shimmer.DDA       1   33.832 61.832
## - HNR               1   33.874 61.874
## - MDVP.Jitter...    1   33.917 61.917
## - MDVP.PPQ          1   34.285 62.285
## - NHR               1   34.509 62.509
## - MDVP.APQ          1   35.110 63.110
## - MDVP.Shimmer.dB.  1   35.212 63.212
## <none>                  33.547 63.547
## - Shimmer.APQ5      1   36.132 64.132
## - MDVP.Shimmer      1   36.196 64.196
## - Jitter.DDP        1   36.623 64.623
## - MDVP.Jitter.Abs.  1   38.505 66.505
## - DFA               1   46.722 74.722
## - spread1           1   58.958 86.958
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=61.81
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + 
##     MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + NHR + HNR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - HNR               1   34.153 60.153
## - MDVP.Jitter...    1   34.162 60.162
## - MDVP.PPQ          1   34.529 60.529
## - NHR               1   34.847 60.847
## - MDVP.Shimmer.dB.  1   35.405 61.405
## <none>                  33.808 61.808
## - MDVP.APQ          1   36.069 62.069
## - Shimmer.APQ5      1   36.458 62.458
## - Jitter.DDP        1   36.776 62.776
## - MDVP.Shimmer      1   37.051 63.051
## - MDVP.Jitter.Abs.  1   38.543 64.543
## - Shimmer.DDA       1   39.258 65.258
## - DFA               1   46.722 72.722
## - spread1           1   58.961 84.961
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=60.15
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + 
##     MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + NHR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - MDVP.Jitter...    1   34.488 58.488
## - MDVP.PPQ          1   35.022 59.022
## - NHR               1   35.114 59.114
## - MDVP.Shimmer.dB.  1   35.941 59.941
## <none>                  34.153 60.153
## - Jitter.DDP        1   37.108 61.108
## - Shimmer.APQ5      1   37.142 61.142
## - MDVP.APQ          1   38.018 62.018
## - MDVP.Jitter.Abs.  1   38.629 62.629
## - MDVP.Shimmer      1   38.780 62.780
## - Shimmer.DDA       1   42.548 66.548
## - DFA               1   49.184 73.184
## - spread1           1   60.020 84.020
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=58.49
## status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + 
##     MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + 
##     NHR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - NHR               1   35.358 57.358
## - MDVP.Shimmer.dB.  1   35.998 57.998
## <none>                  34.488 58.488
## - Shimmer.APQ5      1   37.619 59.619
## - Jitter.DDP        1   37.777 59.777
## - MDVP.APQ          1   38.050 60.050
## - MDVP.Shimmer      1   38.780 60.780
## - MDVP.PPQ          1   39.000 61.000
## - MDVP.Jitter.Abs.  1   39.272 61.272
## - Shimmer.DDA       1   42.561 64.561
## - DFA               1   49.426 71.426
## - spread1           1   60.366 82.366
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=57.36
## status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + 
##     MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + 
##     DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## <none>                  35.358 57.358
## - MDVP.Shimmer.dB.  1   37.395 57.395
## - MDVP.APQ          1   38.509 58.509
## - Shimmer.APQ5      1   38.537 58.537
## - MDVP.Shimmer      1   39.054 59.054
## - MDVP.Jitter.Abs.  1   39.369 59.369
## - MDVP.PPQ          1   40.153 60.153
## - Jitter.DDP        1   40.348 60.348
## - Shimmer.DDA       1   42.827 62.827
## - DFA               1   50.922 70.922
## - spread1           1   61.944 81.944
## 
## Call:  glm(formula = status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + 
##     MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + DFA + spread1, family = binomial, data = pd2, 
##     subset = train)
## 
## Coefficients:
##      (Intercept)  MDVP.Jitter.Abs.          MDVP.PPQ        Jitter.DDP  
##        3.101e+00        -1.755e+05        -3.916e+03         9.114e+02  
##     MDVP.Shimmer  MDVP.Shimmer.dB.      Shimmer.APQ5          MDVP.APQ  
##        4.540e+03         7.531e+01        -1.346e+03        -1.218e+03  
##      Shimmer.DDA               DFA           spread1  
##       -1.851e+03         4.321e+01         5.612e+00  
## 
## Degrees of Freedom: 97 Total (i.e. Null);  87 Residual
## Null Deviance:       109.1 
## Residual Deviance: 35.36     AIC: 57.36

At first glance, the p-values are all very large, except for DFA, which is smaller but still pretty large.

contrasts(pd2[,"status"])
##   1
## 0 0
## 1 1
glm_probs = predict(logist_reg_fit, pd2[test,], type="response")
head(glm_probs)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6 
##      0.3258190      0.9997452      0.7062405      1.0000000      1.0000000 
## phon_R01_S05_3 
##      1.0000000
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6 
##              0              1              1              1              1 
## phon_R01_S05_3 
##              1
length(glm_pred)
## [1] 97
length(y.test)
## [1] 97
table(glm_pred, y.test)
##         y.test
## glm_pred  0  1
##        0 17 11
##        1  7 62

Classification Error

mean(glm_pred!=y.test)
## [1] 0.185567

Accuracy

mean(glm_pred==y.test)
## [1] 0.814433

Using Predictors Chosen By Lasso MDVP.Fo.Hz.+MDVP.Fhi.Hz+MDVP.APQ+DFA+spread1+spread2+D2

colnames(pd2)
##  [1] "MDVP.Fo.Hz."      "MDVP.Fhi.Hz."     "MDVP.Flo.Hz."     "MDVP.Jitter..."  
##  [5] "MDVP.Jitter.Abs." "MDVP.RAP"         "MDVP.PPQ"         "Jitter.DDP"      
##  [9] "MDVP.Shimmer"     "MDVP.Shimmer.dB." "Shimmer.APQ3"     "Shimmer.APQ5"    
## [13] "MDVP.APQ"         "Shimmer.DDA"      "NHR"              "HNR"             
## [17] "RPDE"             "DFA"              "spread1"          "spread2"         
## [21] "D2"               "PPE"              "status"
logist_reg_fit2=glm(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2, family=binomial, subset=train)
summary(logist_reg_fit)
## 
## Call:
## glm(formula = status ~ ., family = binomial, data = pd2, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.25209   0.00000   0.00454   0.14252   1.50515  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      -1.857e+01  4.922e+01  -0.377   0.7059  
## MDVP.Fo.Hz.       3.087e-03  5.866e-02   0.053   0.9580  
## MDVP.Fhi.Hz.      3.238e-03  1.513e-02   0.214   0.8305  
## MDVP.Flo.Hz.     -1.936e-02  3.836e-02  -0.505   0.6138  
## MDVP.Jitter...   -1.497e+03  2.542e+03  -0.589   0.5561  
## MDVP.Jitter.Abs. -2.389e+05  2.527e+05  -0.945   0.3446  
## MDVP.RAP         -6.776e+04  2.884e+05  -0.235   0.8143  
## MDVP.PPQ         -1.864e+03  3.768e+03  -0.495   0.6207  
## Jitter.DDP        2.365e+04  9.598e+04   0.246   0.8053  
## MDVP.Shimmer      5.099e+03  4.432e+03   1.151   0.2499  
## MDVP.Shimmer.dB.  8.412e+01  1.231e+02   0.683   0.4944  
## Shimmer.APQ3      1.352e+05  3.108e+05   0.435   0.6636  
## Shimmer.APQ5     -2.056e+03  1.710e+03  -1.202   0.2292  
## MDVP.APQ         -1.068e+03  1.249e+03  -0.855   0.3925  
## Shimmer.DDA      -4.701e+04  1.037e+05  -0.453   0.6502  
## NHR               8.646e+01  1.023e+02   0.845   0.3982  
## HNR               4.297e-01  5.681e-01   0.756   0.4494  
## RPDE              4.853e-01  9.039e+00   0.054   0.9572  
## DFA               5.860e+01  2.999e+01   1.954   0.0507 .
## spread1           6.140e+00  6.336e+00   0.969   0.3326  
## spread2          -3.070e+00  1.360e+01  -0.226   0.8214  
## D2                9.834e-01  2.602e+00   0.378   0.7055  
## PPE               5.662e+00  6.918e+01   0.082   0.9348  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 109.105  on 97  degrees of freedom
## Residual deviance:  32.917  on 75  degrees of freedom
## AIC: 78.917
## 
## Number of Fisher Scoring iterations: 11
glm_probs = predict(logist_reg_fit2, pd2[test,], type="response")
head(glm_probs)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6 
##      0.6355746      0.8711462      0.7096464      0.9999748      0.9998081 
## phon_R01_S05_3 
##      0.9992184
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6 
##              1              1              1              1              1 
## phon_R01_S05_3 
##              1
table(glm_pred, y.test)
##         y.test
## glm_pred  0  1
##        0 15  8
##        1  9 65
"Error"
## [1] "Error"
mean(glm_pred!=y.test)
## [1] 0.1752577
"Accuracy"
## [1] "Accuracy"
mean(glm_pred==y.test)
## [1] 0.8247423

The model improves

LDA

lda_fit = lda(status~., data=pd2[,-5], subset=train)
lda_fit
## Call:
## lda(status ~ ., data = pd2[, -5], subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.244898 0.755102 
## 
## Group means:
##   MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...    MDVP.RAP    MDVP.PPQ
## 0    180.8565     209.1609     136.2219    0.004001667 0.001983750 0.002148750
## 1    142.4056     189.3842     108.2484    0.007763919 0.004226351 0.004345676
##   Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5   MDVP.APQ
## 0  0.0059525   0.01721833        0.1580000  0.009189167   0.01038167 0.01322583
## 1  0.0126800   0.03571662        0.3466757  0.018682162   0.02135635 0.02978635
##   Shimmer.DDA        NHR      HNR      RPDE       DFA   spread1   spread2
## 0  0.02756625 0.01295083 24.76229 0.4501494 0.6909354 -6.676518 0.1636713
## 1  0.05604608 0.03406108 20.75650 0.5152109 0.7310390 -5.278859 0.2474796
##         D2       PPE
## 0 2.132207 0.1312950
## 1 2.432709 0.2400174
## 
## Coefficients of linear discriminants:
##                            LD1
## MDVP.Fo.Hz.      -3.728117e-03
## MDVP.Fhi.Hz.      6.644095e-04
## MDVP.Flo.Hz.     -6.176427e-03
## MDVP.Jitter...   -4.139984e+02
## MDVP.RAP         -7.502706e+03
## MDVP.PPQ         -7.046188e+02
## Jitter.DDP        2.817771e+03
## MDVP.Shimmer      1.670053e+02
## MDVP.Shimmer.dB.  8.091584e+00
## Shimmer.APQ3     -4.016317e+04
## Shimmer.APQ5     -9.464667e+01
## MDVP.APQ         -3.623173e+01
## Shimmer.DDA       1.331237e+04
## NHR              -3.928908e-01
## HNR               2.446249e-02
## RPDE             -2.670761e+00
## DFA               9.791429e+00
## spread1           1.458082e+00
## spread2           1.285339e+00
## D2                4.694775e-01
## PPE              -1.465356e+00
plot(lda_fit) #produces plots of the linear discriminants

lda_pred = predict(lda_fit, pd2[test,])
names(lda_pred) #the names are  class, contains LDA’s predictions
## [1] "class"     "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.test) #predictions vs real values of test data
##          y.test
## lda_class  0  1
##         0 17  7
##         1  7 66
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
##   1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.test)
## [1] 0.8556701
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.test)
## [1] 0.1443299
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 24
sum(lda_pred$posterior[,1]<0.5)
## [1] 73
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  lda_fit = lda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
  lda_class = lda_pred$class #the class predictions of our fitted model on test data
  error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}

cv_error=mean(error)
cv_error
## [1] 0.1172264

Using Predictors Selected by Lasso status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2

lda_fit = lda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2[,-5], subset=train)
lda_fit
## Call:
## lda(status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.APQ + DFA + spread1 + 
##     spread2 + D2, data = pd2[, -5], subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.244898 0.755102 
## 
## Group means:
##   MDVP.Fo.Hz. MDVP.Fhi.Hz.   MDVP.APQ       DFA   spread1   spread2       D2
## 0    180.8565     209.1609 0.01322583 0.6909354 -6.676518 0.1636713 2.132207
## 1    142.4056     189.3842 0.02978635 0.7310390 -5.278859 0.2474796 2.432709
## 
## Coefficients of linear discriminants:
##                        LD1
## MDVP.Fo.Hz.  -0.0155899826
## MDVP.Fhi.Hz.  0.0005662592
## MDVP.APQ     -0.3739268589
## DFA           5.6755575641
## spread1       0.4777103752
## spread2       0.5062658518
## D2            1.2190753822
plot(lda_fit) #produces plots of the linear discriminants

lda_pred = predict(lda_fit, pd2[test,])
names(lda_pred) #the names are  class, contains LDA’s predictions
## [1] "class"     "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.test) #predictions vs real values of test data
##          y.test
## lda_class  0  1
##         0 14  6
##         1 10 67
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
##   1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.test)
## [1] 0.8350515
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.test)
## [1] 0.1649485
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 20
sum(lda_pred$posterior[,1]<0.5)
## [1] 77
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  lda_fit = lda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
  lda_class = lda_pred$class #the class predictions of our fitted model on test data
  error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}

cv_error=mean(error)
cv_error
## [1] 0.1498723

QDA

qda_fit=qda(status~., data=pd2, subset = train)
qda_fit
## Call:
## qda(status ~ ., data = pd2, subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.244898 0.755102 
## 
## Group means:
##   MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 0    180.8565     209.1609     136.2219    0.004001667     2.516667e-05
## 1    142.4056     189.3842     108.2484    0.007763919     5.581081e-05
##      MDVP.RAP    MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 0 0.001983750 0.002148750  0.0059525   0.01721833        0.1580000  0.009189167
## 1 0.004226351 0.004345676  0.0126800   0.03571662        0.3466757  0.018682162
##   Shimmer.APQ5   MDVP.APQ Shimmer.DDA        NHR      HNR      RPDE       DFA
## 0   0.01038167 0.01322583  0.02756625 0.01295083 24.76229 0.4501494 0.6909354
## 1   0.02135635 0.02978635  0.05604608 0.03406108 20.75650 0.5152109 0.7310390
##     spread1   spread2       D2       PPE
## 0 -6.676518 0.1636713 2.132207 0.1312950
## 1 -5.278859 0.2474796 2.432709 0.2400174
qda_class=predict(qda_fit, pd2[test,])$class
table(qda_class, y.test)
##          y.test
## qda_class  0  1
##         0  7  1
##         1 17 72
"Accuracy"
## [1] "Accuracy"
mean(qda_class==y.test)
## [1] 0.814433
"Classification Error"
## [1] "Classification Error"
mean(qda_class!=y.test)
## [1] 0.185567
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

Using the validation set approach, LDA performs best

#CV for QDA
k=10 #number of folds
Dataset=pd2

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  qda_fit = qda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
  error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
}

cv_error=mean(error)
cv_error
## [1] 0.1312084

Using Predictors Selected by Lasso

qda_fit=qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2, subset = train)
qda_fit
## Call:
## qda(status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.APQ + DFA + spread1 + 
##     spread2 + D2, data = pd2, subset = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.244898 0.755102 
## 
## Group means:
##   MDVP.Fo.Hz. MDVP.Fhi.Hz.   MDVP.APQ       DFA   spread1   spread2       D2
## 0    180.8565     209.1609 0.01322583 0.6909354 -6.676518 0.1636713 2.132207
## 1    142.4056     189.3842 0.02978635 0.7310390 -5.278859 0.2474796 2.432709
qda_class=predict(qda_fit, pd2[test,])$class
table(qda_class, y.test)
##          y.test
## qda_class  0  1
##         0 19 13
##         1  5 60
"Accuracy"
## [1] "Accuracy"
mean(qda_class==y.test)
## [1] 0.814433
"Classification Error"
## [1] "Classification Error"
mean(qda_class!=y.test)
## [1] 0.185567
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
#CV for QDA
k=10 #number of folds
Dataset=pd2

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  qda_fit = qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
  error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
}

cv_error=mean(error)
cv_error
## [1] 0.1384934

Trees

tree.pd=tree(pd[,"status"]~., pd2[,1:22], subset=train)
summary(tree.pd)
## 
## Classification tree:
## tree(formula = pd[, "status"] ~ ., data = pd2[, 1:22], subset = train)
## Variables actually used in tree construction:
## [1] "spread1"      "MDVP.Fhi.Hz." "MDVP.APQ"     "MDVP.Flo.Hz."
## Number of terminal nodes:  7 
## Residual mean deviance:  0.2173 = 19.78 / 91 
## Misclassification error rate: 0.06122 = 6 / 98
plot(tree.pd)
text(tree.pd, pretty=0)

Spread1 seems to be the most important predictor here.

tree.pred=predict(tree.pd, pd2[test,], type="class")
table(tree.pred, y.test)
##          y.test
## tree.pred  0  1
##         0 14  6
##         1 10 67
"Accuracy"
## [1] "Accuracy"
mean(tree.pred==y.test)
## [1] 0.8350515
"Error"
## [1] "Error"
mean(tree.pred!=y.test)
## [1] 0.1649485

Pruning

The test error rate seems good. We consider pruning the tree.

set.seed(4706)
cv.pd=cv.tree(tree.pd, FUN=prune.misclass)
names(cv.pd)
## [1] "size"   "dev"    "k"      "method"
cv.pd
## $size
## [1] 7 6 2 1
## 
## $dev
## [1] 20 19 15 26
## 
## $k
## [1]  -Inf  0.00  1.25 13.00
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

$dev is the CV error rate. It seems to be lowest for tree size 2.

par(mfrow=c(1,2))
plot(cv.pd$size, cv.pd$dev, type="b", ylab="cv error")
plot(cv.pd$k, cv.pd$dev, type="b", ylab = "cv error")

Prune the tree and obtain the 2-node tree

pruned.pd=prune.misclass(tree.pd, best=2)
plot(pruned.pd)
text(pruned.pd, pretty=0)

pruned.pred=predict(pruned.pd, pd2[test,], type="class")
table(pruned.pred, y.test)
##            y.test
## pruned.pred  0  1
##           0 14  6
##           1 10 67
"Accuracy"
## [1] "Accuracy"
mean(pruned.pred==y.test)
## [1] 0.8350515
"Error"
## [1] "Error"
mean(pruned.pred!=y.test)
## [1] 0.1649485

The unpruned tree actually performed better than the pruned tree.

Bagging and Random Forest

BAGGING

#bagging
set.seed(4706)
bag.pd=randomForest(pd2[,"status"]~., pd2[,1:22], subset=train, mtry=13, importance=TRUE)
#mtry: Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
bag.pd
## 
## Call:
##  randomForest(formula = pd2[, "status"] ~ ., data = pd2[, 1:22],      mtry = 13, importance = TRUE, subset = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 13
## 
##         OOB estimate of  error rate: 13.27%
## Confusion matrix:
##    0  1 class.error
## 0 15  9  0.37500000
## 1  4 70  0.05405405
plot(bag.pd)

yhat.bag=predict(bag.pd, newdata = pd2[test,])
plot(yhat.bag, y.test, xlab="bagging pred", ylab="actual")

"Error"
## [1] "Error"
mean(yhat.bag!=y.test)
## [1] 0.07216495
"Accuracy"
## [1] "Accuracy"
mean(yhat.bag==y.test)
## [1] 0.9278351

The bagged tree shows the highest accuracy so far, but the OOB estimate of error rate is 13.27%

RANDOM FOREST

set.seed(4706)
rf.pd=randomForest(pd2[,"status"]~., pd2[,1:22], subset=train, mtry=6, importance=TRUE)
yhat.rf=predict(rf.pd, newdata = pd2[test,])
rf.pd
## 
## Call:
##  randomForest(formula = pd2[, "status"] ~ ., data = pd2[, 1:22],      mtry = 6, importance = TRUE, subset = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 13.27%
## Confusion matrix:
##    0  1 class.error
## 0 15  9  0.37500000
## 1  4 70  0.05405405
plot(rf.pd)

"Error"
## [1] "Error"
mean(yhat.rf!=y.test)
## [1] 0.08247423
"Accuracy"
## [1] "Accuracy"
mean(yhat.rf==y.test)
## [1] 0.9175258

Random Forest performed almost as well as bagging

importance(rf.pd)
##                           0         1 MeanDecreaseAccuracy MeanDecreaseGini
## MDVP.Fo.Hz.      11.4448187 11.104339            14.038699        4.5970648
## MDVP.Fhi.Hz.      6.6755381  4.530633             7.030507        1.7144558
## MDVP.Flo.Hz.      3.3536820  1.920882             3.439286        1.0143408
## MDVP.Jitter...   -1.0025620  4.510267             3.801288        0.9789375
## MDVP.Jitter.Abs.  2.1330052  4.343872             4.835862        1.1343849
## MDVP.RAP          0.4450338  5.115567             4.859047        1.0574212
## MDVP.PPQ         -0.6501283  4.160240             3.312010        1.1718007
## Jitter.DDP        0.2283893  6.492795             6.314745        1.0522543
## MDVP.Shimmer      1.4712924  7.059461             6.899473        0.9189805
## MDVP.Shimmer.dB.  0.6454145  7.370814             7.265857        0.8806220
## Shimmer.APQ3     -1.1320049  4.214220             3.730090        0.7356575
## Shimmer.APQ5     -1.5924925  6.229775             5.108339        1.1045711
## MDVP.APQ          5.1574296  8.979118            10.048115        2.3779397
## Shimmer.DDA      -1.6055500  5.965143             4.968150        0.6076920
## NHR               1.2888463  2.548764             2.943565        1.4794231
## HNR              -0.3993738  3.070284             2.072825        0.7257339
## RPDE             -3.3861256  4.774894             2.629008        0.9505619
## DFA               2.5724624  4.559135             4.793670        1.3004174
## spread1          10.1666590 13.757678            14.840937        5.8751827
## spread2           5.5245157  6.734957             7.791857        1.6422095
## D2                3.9574381  2.227187             3.997375        1.0211105
## PPE               6.9094008 10.733483            11.188341        3.6558096

Boosting

Boosting

library(gbm)
## Loaded gbm 2.1.8
#gbm() with the option distribution="gaussian" since this is a regression problem; if it were a bi- nary classification problem, we would use distribution="bernoulli". The argument n.trees=5000 indicates that we want 5000 trees, and the option interaction.depth=4 limits the depth of each tree

set.seed(4706)

boost.pd=gbm(pd2[train,"status"]~., pd2[train,1:22], distribution = "gaussian", n.trees=5000, interaction.depth=4)
#shrinkage=0.001 by default
#can change it
summary(boost.pd)

##                               var     rel.inf
## spread1                   spread1 20.18000535
## MDVP.Fo.Hz.           MDVP.Fo.Hz. 15.50987695
## spread2                   spread2  9.47971138
## DFA                           DFA  6.80467690
## MDVP.Fhi.Hz.         MDVP.Fhi.Hz.  6.25492044
## MDVP.APQ                 MDVP.APQ  5.82610604
## D2                             D2  4.17080580
## MDVP.Shimmer.dB. MDVP.Shimmer.dB.  3.79637001
## MDVP.RAP                 MDVP.RAP  3.39939800
## Shimmer.APQ5         Shimmer.APQ5  3.29340672
## RPDE                         RPDE  2.78656052
## NHR                           NHR  2.45493065
## MDVP.PPQ                 MDVP.PPQ  2.21570361
## MDVP.Jitter...     MDVP.Jitter...  2.21299478
## MDVP.Shimmer         MDVP.Shimmer  2.17943916
## MDVP.Flo.Hz.         MDVP.Flo.Hz.  2.17164003
## MDVP.Jitter.Abs. MDVP.Jitter.Abs.  1.91427694
## PPE                           PPE  1.88288846
## HNR                           HNR  1.68732825
## Shimmer.APQ3         Shimmer.APQ3  1.64984109
## Jitter.DDP             Jitter.DDP  0.11166319
## Shimmer.DDA           Shimmer.DDA  0.01745572
par(mfrow=c(1,2))
plot(boost.pd, i="spread1")

plot(boost.pd, i="MDVP.Fo.Hz.")

Analysis on Averaged Dataset (observations = individuals)

pdavg$status=as.factor(pdavg$status)
summary(pdavg)
##    Group.1           MDVP.Fo.Hz.      MDVP.Fhi.Hz.    MDVP.Flo.Hz.   
##  Length:32          Min.   : 97.94   Min.   :107.8   Min.   : 66.23  
##  Class :character   1st Qu.:118.12   1st Qu.:140.4   1st Qu.: 93.42  
##  Mode  :character   Median :150.10   Median :197.2   Median :104.58  
##                     Mean   :153.86   Mean   :195.6   Mean   :116.27  
##                     3rd Qu.:183.74   3rd Qu.:226.5   3rd Qu.:128.35  
##                     Max.   :243.81   Max.   :393.9   Max.   :222.12  
##  MDVP.Jitter...     MDVP.Jitter.Abs.       MDVP.RAP           MDVP.PPQ       
##  Min.   :0.002147   Min.   :9.167e-06   Min.   :0.000925   Min.   :0.001133  
##  1st Qu.:0.003864   1st Qu.:2.500e-05   1st Qu.:0.001782   1st Qu.:0.002066  
##  Median :0.004832   Median :3.536e-05   Median :0.002426   Median :0.002828  
##  Mean   :0.006156   Mean   :4.375e-05   Mean   :0.003271   Mean   :0.003407  
##  3rd Qu.:0.006831   3rd Qu.:5.208e-05   3rd Qu.:0.003833   3rd Qu.:0.003731  
##  Max.   :0.020787   Max.   :1.600e-04   Max.   :0.012718   Max.   :0.012237  
##    Jitter.DDP        MDVP.Shimmer     MDVP.Shimmer.dB.   Shimmer.APQ3     
##  Min.   :0.002780   Min.   :0.01080   Min.   :0.09567   Min.   :0.005383  
##  1st Qu.:0.005348   1st Qu.:0.01700   1st Qu.:0.16094   1st Qu.:0.008970  
##  Median :0.007278   Median :0.02355   Median :0.21708   Median :0.013267  
##  Mean   :0.009812   Mean   :0.02941   Mean   :0.27874   Mean   :0.015527  
##  3rd Qu.:0.011500   3rd Qu.:0.03656   3rd Qu.:0.33029   3rd Qu.:0.019817  
##  Max.   :0.038157   Max.   :0.07843   Max.   :0.87114   Max.   :0.037554  
##   Shimmer.APQ5         MDVP.APQ         Shimmer.DDA           NHR          
##  Min.   :0.006758   Min.   :0.008195   Min.   :0.01615   Min.   :0.001495  
##  1st Qu.:0.010235   1st Qu.:0.013479   1st Qu.:0.02692   1st Qu.:0.005696  
##  Median :0.013642   Median :0.018527   Median :0.03979   Median :0.012793  
##  Mean   :0.017677   Mean   :0.023763   Mean   :0.04658   Mean   :0.024273  
##  3rd Qu.:0.020677   3rd Qu.:0.028656   3rd Qu.:0.05945   3rd Qu.:0.025694  
##  Max.   :0.050831   Max.   :0.081151   Max.   :0.11266   Max.   :0.174122  
##       HNR        status      RPDE             DFA            spread1      
##  Min.   :12.06   0: 8   Min.   :0.3275   Min.   :0.6383   Min.   :-7.590  
##  1st Qu.:19.60   1:24   1st Qu.:0.4281   1st Qu.:0.6689   1st Qu.:-6.275  
##  Median :22.17          Median :0.4858   Median :0.7200   Median :-5.630  
##  Mean   :21.96          Mean   :0.4981   Mean   :0.7181   Mean   :-5.699  
##  3rd Qu.:24.81          3rd Qu.:0.5890   3rd Qu.:0.7628   3rd Qu.:-5.173  
##  Max.   :30.99          Max.   :0.6384   Max.   :0.8213   Max.   :-3.657  
##     spread2              D2             PPE         
##  Min.   :0.05518   Min.   :1.796   Min.   :0.06811  
##  1st Qu.:0.17310   1st Qu.:2.193   1st Qu.:0.15685  
##  Median :0.22778   Median :2.324   Median :0.20538  
##  Mean   :0.22475   Mean   :2.372   Mean   :0.20532  
##  3rd Qu.:0.28154   3rd Qu.:2.536   3rd Qu.:0.23381  
##  Max.   :0.35930   Max.   :3.141   Max.   :0.39514

Plotting for Visualization

pairs(pdavg[,-1], col=pdavg$status)

#red=parkinson's
plot(pdavg$PPE, pdavg$spread1, col=pdavg$status)

plot(pdavg$PPE, pdavg$spread1, col=pdavg$status)

pd2avg=pdavg[,-1]
rownames(pd2avg) = pdavg[,1]
pd2avg=pd2avg[,-17]
par(mfrow=c(3,3))
for (i in colnames(pd2avg))
  {hist(pd2avg[,i], main=i)}

par(mfrow=c(1,3))
for (i in colnames(pd2avg))
  {boxplot(pd2avg[,i], main=i)}

Fewer outliers after averaging

outl=boxplot.stats(pd2avg[,"MDVP.Flo.Hz."])$out
ind=which(pd2avg[,"MDVP.Flo.Hz."] %in% c(outl))
pd2avg[c(ind),]
##              MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S07    200.2668     210.8843     194.3662    0.002163333
## phon_R01_S10    243.8143     254.2805     222.1150    0.002390000
##              MDVP.Jitter.Abs. MDVP.RAP    MDVP.PPQ  Jitter.DDP MDVP.Shimmer
## phon_R01_S07     9.666667e-06 0.001175 0.001281667 0.003523333   0.01080333
## phon_R01_S10     9.166667e-06 0.001285 0.001486667 0.003850000   0.01530833
##              MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5   MDVP.APQ Shimmer.DDA
## phon_R01_S07       0.09566667  0.005383333  0.006870000 0.00819500  0.01614833
## phon_R01_S10       0.13700000  0.008641667  0.009243333 0.01073333  0.02592167
##                      NHR      HNR      RPDE       DFA   spread1   spread2
## phon_R01_S07 0.001495000 30.99217 0.3955782 0.7414823 -7.589537 0.1730488
## phon_R01_S10 0.005421667 24.61467 0.4517002 0.6382515 -7.105562 0.1298533
##                    D2        PPE
## phon_R01_S07 1.795701 0.06811333
## phon_R01_S10 2.298464 0.09839017
pdavg[c(ind),]
##        Group.1 MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 6 phon_R01_S07    200.2668     210.8843     194.3662    0.002163333
## 8 phon_R01_S10    243.8143     254.2805     222.1150    0.002390000
##   MDVP.Jitter.Abs. MDVP.RAP    MDVP.PPQ  Jitter.DDP MDVP.Shimmer
## 6     9.666667e-06 0.001175 0.001281667 0.003523333   0.01080333
## 8     9.166667e-06 0.001285 0.001486667 0.003850000   0.01530833
##   MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5   MDVP.APQ Shimmer.DDA         NHR
## 6       0.09566667  0.005383333  0.006870000 0.00819500  0.01614833 0.001495000
## 8       0.13700000  0.008641667  0.009243333 0.01073333  0.02592167 0.005421667
##        HNR status      RPDE       DFA   spread1   spread2       D2        PPE
## 6 30.99217      0 0.3955782 0.7414823 -7.589537 0.1730488 1.795701 0.06811333
## 8 24.61467      0 0.4517002 0.6382515 -7.105562 0.1298533 2.298464 0.09839017

Both healthy, not going to remove (will lost 2 out of 8)

Unsupervised Learning Methods for Visualization

pca_pdavg=prcomp(pd2avg, scale=TRUE)
str(pca_pdavg)
## List of 5
##  $ sdev    : num [1:22] 3.686 1.678 1.261 1.174 0.955 ...
##  $ rotation: num [1:22, 1:22] 0.0613 0.00712 0.07747 -0.25043 -0.23769 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:22] 1.54e+02 1.96e+02 1.16e+02 6.16e-03 4.37e-05 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ scale   : Named num [1:22] 4.09e+01 6.28e+01 3.76e+01 4.26e-03 3.12e-05 ...
##   ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
##  $ x       : num [1:32, 1:22] -4.493 0.505 1.423 -2.501 1.449 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:32] "phon_R01_S01" "phon_R01_S02" "phon_R01_S04" "phon_R01_S05" ...
##   .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
ggbiplot(pca_pdavg)

biplot(pca_pdavg, scale=0)

ggbiplot(pca_pdavg, labels=rownames(pd2avg))

disease_stat=ifelse(pdavg$status=="1", "PD", "HLT")
ggbiplot(pca_pdavg,ellipse=TRUE, groups=disease_stat)

Clustering seems to make a bit more sense here

head(pca_pdavg$rotation)
##                           PC1         PC2         PC3        PC4         PC5
## MDVP.Fo.Hz.       0.061299358  0.54841845 -0.04272008  0.1334530 -0.01731036
## MDVP.Fhi.Hz.      0.007124914  0.42368002  0.29295524 -0.1465257 -0.38838306
## MDVP.Flo.Hz.      0.077473404  0.36946891 -0.39066813  0.1664252 -0.08125467
## MDVP.Jitter...   -0.250434358  0.06358702 -0.10210604 -0.2735671 -0.09124493
## MDVP.Jitter.Abs. -0.237685724 -0.11220925 -0.08416312 -0.3426294 -0.02942348
## MDVP.RAP         -0.245194803  0.09518996 -0.11760315 -0.2968125 -0.02973169
##                          PC6         PC7          PC8         PC9        PC10
## MDVP.Fo.Hz.       0.06451036  0.10129636  0.250782418  0.17629433 -0.38212957
## MDVP.Fhi.Hz.     -0.31046652  0.53176730 -0.238047987  0.11492100  0.12392285
## MDVP.Flo.Hz.      0.68823690  0.09051954 -0.022434181 -0.12629583  0.08220497
## MDVP.Jitter...    0.02412989 -0.03705645 -0.001180571  0.05092826  0.01474532
## MDVP.Jitter.Abs.  0.06793527 -0.04169890 -0.045283585 -0.06316520 -0.23387602
## MDVP.RAP          0.05694397 -0.09228001  0.022900453  0.08702280  0.02192711
##                        PC11        PC12        PC13         PC14         PC15
## MDVP.Fo.Hz.      -0.4772299  0.04323745 -0.37155927  0.070051482 -0.006036326
## MDVP.Fhi.Hz.      0.2795802  0.09946360  0.06378219 -0.055530710 -0.013558098
## MDVP.Flo.Hz.      0.3619906 -0.01154541  0.15208114 -0.077901298 -0.052741319
## MDVP.Jitter...   -0.2084222  0.08015262  0.11618340 -0.003004445 -0.183885581
## MDVP.Jitter.Abs.  0.1904564  0.20951570  0.01657499  0.334014047 -0.448111892
## MDVP.RAP         -0.1438669  0.16760632  0.08978935 -0.202068305  0.199174295
##                          PC16        PC17        PC18         PC19         PC20
## MDVP.Fo.Hz.       0.152753484 -0.14410026  0.00301326 -0.055165936 -0.036871357
## MDVP.Fhi.Hz.     -0.016256399 -0.03462657  0.03166574  0.007141266  0.001807813
## MDVP.Flo.Hz.     -0.006152586  0.02540490  0.01298393  0.036460549  0.012778431
## MDVP.Jitter...    0.341058852  0.47759418  0.20442131  0.590944110  0.047692444
## MDVP.Jitter.Abs.  0.188485433 -0.49274449  0.17221276 -0.120652132 -0.152067569
## MDVP.RAP         -0.376085648 -0.15122816 -0.06181958  0.038611573 -0.017555511
##                           PC21          PC22
## MDVP.Fo.Hz.       1.658498e-05 -7.976223e-05
## MDVP.Fhi.Hz.     -2.347411e-05 -1.093318e-05
## MDVP.Flo.Hz.     -5.062547e-06  4.774867e-06
## MDVP.Jitter...    6.122519e-04 -2.340494e-04
## MDVP.Jitter.Abs.  6.404405e-04 -4.669768e-04
## MDVP.RAP          7.050823e-01  5.695548e-02

Proprotion of Variance Explained by PC1

pr.var=pca_pdavg$sdev^2
pve=pr.var/sum(pr.var)
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type="b")
plot(cumsum(pve), xlab="Principal Component ", ylab=" Cumulative Proportion of Variance Explained ", ylim=c(0,1), type="b")

PC1 explains 61.8% of the variance

Clustering

Clustering with K=2

km.pd2avg=kmeans(pd2avg,2,nstart=20)
head(km.pd2avg$cluster)
## phon_R01_S01 phon_R01_S02 phon_R01_S04 phon_R01_S05 phon_R01_S06 phon_R01_S07 
##            1            1            1            1            1            2
head(pdavg$status)
## [1] 1 1 1 1 1 0
## Levels: 0 1
s=ifelse(pdavg$status==1, 1, 2) #so the colors match
head(s) 
## [1] 1 1 1 1 1 2
par(mfrow=c(1,2))
for (i in colnames(pd2avg))
{plot(pd2avg[,i], col=(km.pd2avg$cluster), main="K-Means Clustering; K=2", xlab="", ylab=i, pch=20, cex=2)
plot(pd2avg[,i], col=(s), main="By Status; black=PD", xlab="", ylab=i, pch=20, cex=2)}

Clustering seems to make a lot more sense now. General trends of the 2 clusters look similar to the actual response classes.

Training and Test Sets

Since there are only 8 healthy individuals, I make sure to include a sufficient number in the training and test sets, so I split them into healthy vs diseased, sample from each, and then combine them into training and test sets.

pd2avg=cbind(pd2avg, pdavg[,"status"])
colnames(pd2avg)=c("MDVP.Fo.Hz.",      "MDVP.Fhi.Hz." ,    "MDVP.Flo.Hz."  ,   "MDVP.Jitter..." ,  "MDVP.Jitter.Abs." ,"MDVP.RAP"     ,   "MDVP.PPQ"    ,     "Jitter.DDP"  ,     "MDVP.Shimmer"   ,  "MDVP.Shimmer.dB." ,"Shimmer.APQ3"   ,  "Shimmer.APQ5" ,   
 "MDVP.APQ"  ,       "Shimmer.DDA"  ,    "NHR" ,             "HNR"    ,          "RPDE"    ,         "DFA"     ,        
"spread1"     ,     "spread2"      ,    "D2"        ,       "PPE"      ,        "status")
pd_dis_idx=which(pd2avg$status==1) #indeces of PD individuals
pd_health_idx=which(pd2avg$status==0) #indeces of healthy individuals
#length(pd_health_idx)
#length(pd_dis_idx)
set.seed(4706)

#length(pd_dis_idx)
#length(pd_health_idx)

#total: 24 diseased, 8 healthy

#12 diseased test
#4 healthy test
#12 diseased train
#4 healthy train

test_health=sample(pd_health_idx, 4, replace = FALSE) #sample from indeces
#test_health

test_dis=sample(pd_dis_idx, 12, replace = FALSE) #sample from indeces
#test_dis

testavg=append(test_dis, test_health) #test set

tot=1:nrow(pd2avg)

trainavg=tot[-testavg] #training set
#trainavg
#testavg

y.trainavg=pd2avg[trainavg, "status"]
y.testavg=pd2avg[testavg, "status"]

y.trainavg
##  [1] 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0
## Levels: 0 1
y.testavg
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## Levels: 0 1
#trainavg
#testavg

Best Subset Selection

regfit_fullavg=regsubsets(status~., data=pd2avg, subset=train, nvmax=23) #nvmax=p
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 7 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 15
## Warning in leaps.exhaustive(a, really.big): XHAUST returned error code -999
reg_full_sumavg=summary(regfit_fullavg) #returns R^2, adjusted R^2, AIC, BIC, Cp
reg_full_sumavg
## Subset selection object
## Call: regsubsets.formula(status ~ ., data = pd2avg, subset = train, 
##     nvmax = 23)
## 22 Variables  (and intercept)
##                  Forced in Forced out
## MDVP.Fo.Hz.          FALSE      FALSE
## MDVP.Fhi.Hz.         FALSE      FALSE
## MDVP.Flo.Hz.         FALSE      FALSE
## MDVP.Jitter...       FALSE      FALSE
## MDVP.Jitter.Abs.     FALSE      FALSE
## MDVP.RAP             FALSE      FALSE
## MDVP.PPQ             FALSE      FALSE
## Jitter.DDP           FALSE      FALSE
## MDVP.Shimmer         FALSE      FALSE
## MDVP.Shimmer.dB.     FALSE      FALSE
## Shimmer.APQ3         FALSE      FALSE
## Shimmer.APQ5         FALSE      FALSE
## MDVP.APQ             FALSE      FALSE
## Shimmer.DDA          FALSE      FALSE
## NHR                  FALSE      FALSE
## HNR                  FALSE      FALSE
## RPDE                 FALSE      FALSE
## DFA                  FALSE      FALSE
## spread1              FALSE      FALSE
## spread2              FALSE      FALSE
## D2                   FALSE      FALSE
## PPE                  FALSE      FALSE
## 1 subsets of each size up to 15
## Selection Algorithm: exhaustive
##           MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 1  ( 1 )  "*"         " "          " "          " "            " "             
## 2  ( 1 )  "*"         "*"          " "          " "            " "             
## 3  ( 1 )  "*"         "*"          "*"          " "            " "             
## 4  ( 1 )  "*"         "*"          "*"          "*"            " "             
## 5  ( 1 )  "*"         "*"          "*"          "*"            "*"             
## 6  ( 1 )  "*"         "*"          "*"          "*"            "*"             
## 7  ( 1 )  "*"         "*"          "*"          "*"            "*"             
## 8  ( 1 )  "*"         "*"          "*"          "*"            "*"             
## 9  ( 1 )  "*"         "*"          "*"          "*"            "*"             
## 10  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 11  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 12  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 13  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 14  ( 1 ) "*"         "*"          "*"          "*"            "*"             
## 15  ( 1 ) "*"         "*"          "*"          "*"            "*"             
##           MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1  ( 1 )  " "      " "      " "        " "          " "             
## 2  ( 1 )  " "      " "      " "        " "          " "             
## 3  ( 1 )  " "      " "      " "        " "          " "             
## 4  ( 1 )  " "      " "      " "        " "          " "             
## 5  ( 1 )  " "      " "      " "        " "          " "             
## 6  ( 1 )  "*"      " "      " "        " "          " "             
## 7  ( 1 )  "*"      "*"      " "        " "          " "             
## 8  ( 1 )  "*"      "*"      "*"        " "          " "             
## 9  ( 1 )  "*"      "*"      "*"        "*"          " "             
## 10  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 11  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 12  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 13  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 14  ( 1 ) "*"      "*"      "*"        "*"          "*"             
## 15  ( 1 ) "*"      "*"      "*"        "*"          "*"             
##           Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA
## 1  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 2  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 3  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 4  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 5  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 6  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 7  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 8  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 9  ( 1 )  " "          " "          " "      " "         " " " " " "  " "
## 10  ( 1 ) " "          " "          " "      " "         " " " " " "  " "
## 11  ( 1 ) "*"          " "          " "      " "         " " " " " "  " "
## 12  ( 1 ) "*"          "*"          " "      " "         " " " " " "  " "
## 13  ( 1 ) "*"          "*"          "*"      " "         " " " " " "  " "
## 14  ( 1 ) "*"          "*"          "*"      "*"         " " " " " "  " "
## 15  ( 1 ) "*"          "*"          "*"      "*"         "*" " " " "  " "
##           spread1 spread2 D2  PPE
## 1  ( 1 )  " "     " "     " " " "
## 2  ( 1 )  " "     " "     " " " "
## 3  ( 1 )  " "     " "     " " " "
## 4  ( 1 )  " "     " "     " " " "
## 5  ( 1 )  " "     " "     " " " "
## 6  ( 1 )  " "     " "     " " " "
## 7  ( 1 )  " "     " "     " " " "
## 8  ( 1 )  " "     " "     " " " "
## 9  ( 1 )  " "     " "     " " " "
## 10  ( 1 ) " "     " "     " " " "
## 11  ( 1 ) " "     " "     " " " "
## 12  ( 1 ) " "     " "     " " " "
## 13  ( 1 ) " "     " "     " " " "
## 14  ( 1 ) " "     " "     " " " "
## 15  ( 1 ) " "     " "     " " " "
names(reg_full_sumavg)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
names(regfit_fullavg)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"    "first"    
##  [7] "last"      "vorder"    "tol"       "rss"       "bound"     "nvmax"    
## [13] "ress"      "ir"        "nbest"     "lopt"      "il"        "ier"      
## [19] "xnames"    "method"    "force.in"  "force.out" "sserr"     "intercept"
## [25] "lindep"    "nullrss"   "nn"        "call"

Plotting

par(mfrow=c(2,2))

#RSS
plot(reg_full_sumavg$rss, xlab="Number of variables", ylab="RSS", type ="l")
mnrss=which.min(reg_full_sumavg$rss)
#Plotting min RSS
points(mnrss, reg_full_sumavg$adjr2[mnrss], col="red", cex=2, pch=20)

#Adjusted R^2
plot(reg_full_sumavg$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type ="l")
mxr2=which.max(reg_full_sumavg$adjr2)
#Plotting max adjusted R^2
points(mxr2, reg_full_sumavg$adjr2[mxr2], col="red", cex=2, pch=20)


#BIC
plot(reg_full_sumavg$bic, xlab="Number of variables", ylab = "BIC", type="l")
mnbic=which.min(reg_full_sum$bic)
points(mnbic, reg_full_sumavg$bic[mnbic], col="green", pch=20, cex=2)

BIC selects for a smaller model (5 predictors) (heavier penalty)
Adjusted R^2 max is around 5 as well.
As expected, RSS decreases as more predictors are added, but this is not indicative of test error rate.

Plotting Selected Variables

plot(regfit_fullavg, scale="r2")

plot(regfit_fullavg, scale="adjr2")

plot(regfit_fullavg, scale="bic")

Coef of models

"model with highest adjusted R^2"
## [1] "model with highest adjusted R^2"
coef(regfit_fullavg, mxr2) #model with highest adjusted R^2
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     3.723613e+00    -9.071021e-03     3.961108e-04    -4.025395e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs. 
##     1.320928e+02    -1.933071e+04
"model with lowest CP"
## [1] "model with lowest CP"
coef(regfit_fullavg, mncp) #model with lowest CP
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     3.657245e+00    -8.664956e-03     2.801145e-04    -4.022328e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP 
##     1.506663e+02    -1.862220e+04    -3.690324e+01
"model with lowest BIC"
## [1] "model with lowest BIC"
coef(regfit_fullavg, mnbic) #model with lowest BIC
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     3.723613e+00    -9.071021e-03     3.961108e-04    -4.025395e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs. 
##     1.320928e+02    -1.933071e+04

Ridge Regression & Lasso

x=model.matrix(status~., pd2avg)[,-1] #[,-1] to remove the intercept
y=pd2avg$status
#automatically transforms any qualitative variables into dummy variables

Ridge Regression

#standardizes by default
#alpha=0 ridge regression,  alpha=1  lasso
grid=10^seq(10, -2, length=100) #10 to the power of a sequence -> our lambda values
grid
##   [1] 1.000000e+10 7.564633e+09 5.722368e+09 4.328761e+09 3.274549e+09
##   [6] 2.477076e+09 1.873817e+09 1.417474e+09 1.072267e+09 8.111308e+08
##  [11] 6.135907e+08 4.641589e+08 3.511192e+08 2.656088e+08 2.009233e+08
##  [16] 1.519911e+08 1.149757e+08 8.697490e+07 6.579332e+07 4.977024e+07
##  [21] 3.764936e+07 2.848036e+07 2.154435e+07 1.629751e+07 1.232847e+07
##  [26] 9.326033e+06 7.054802e+06 5.336699e+06 4.037017e+06 3.053856e+06
##  [31] 2.310130e+06 1.747528e+06 1.321941e+06 1.000000e+06 7.564633e+05
##  [36] 5.722368e+05 4.328761e+05 3.274549e+05 2.477076e+05 1.873817e+05
##  [41] 1.417474e+05 1.072267e+05 8.111308e+04 6.135907e+04 4.641589e+04
##  [46] 3.511192e+04 2.656088e+04 2.009233e+04 1.519911e+04 1.149757e+04
##  [51] 8.697490e+03 6.579332e+03 4.977024e+03 3.764936e+03 2.848036e+03
##  [56] 2.154435e+03 1.629751e+03 1.232847e+03 9.326033e+02 7.054802e+02
##  [61] 5.336699e+02 4.037017e+02 3.053856e+02 2.310130e+02 1.747528e+02
##  [66] 1.321941e+02 1.000000e+02 7.564633e+01 5.722368e+01 4.328761e+01
##  [71] 3.274549e+01 2.477076e+01 1.873817e+01 1.417474e+01 1.072267e+01
##  [76] 8.111308e+00 6.135907e+00 4.641589e+00 3.511192e+00 2.656088e+00
##  [81] 2.009233e+00 1.519911e+00 1.149757e+00 8.697490e-01 6.579332e-01
##  [86] 4.977024e-01 3.764936e-01 2.848036e-01 2.154435e-01 1.629751e-01
##  [91] 1.232847e-01 9.326033e-02 7.054802e-02 5.336699e-02 4.037017e-02
##  [96] 3.053856e-02 2.310130e-02 1.747528e-02 1.321941e-02 1.000000e-02
ridge.mod=glmnet(x, y, alpha=0, lambda=grid, family = "binomial")
#rows (predictors), columns (lambda values)
ridge.mod$lambda[50] #lambda value
## [1] 11497.57
coef(ridge.mod)[,50] #coef for this lambda value (index)
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     1.098296e+00    -3.758310e-07    -1.594538e-07    -4.586503e-07 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##     2.825076e-03     4.698392e-01     4.446949e-03     5.073418e-03 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     1.482471e-03     9.544775e-04     8.826712e-05     1.722701e-03 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     1.392087e-03     1.037698e-03     5.742150e-04     2.353659e-04 
##              HNR             RPDE              DFA          spread1 
##    -3.480791e-06     1.358691e-04     1.742388e-04     2.530471e-05 
##          spread2               D2              PPE 
##     2.975963e-04     4.869601e-05     2.849646e-04

Estimating Test Error

ridge.mod=glmnet(x[trainavg,],y[trainavg],alpha=0,lambda=grid, thresh=1e-12, family = "binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
ridge.pred=predict(ridge.mod,s=4,newx=x[testavg,])
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
#check vs least squares lambda=0
ridge.pred=predict(ridge.mod, s=0, newx=x[testavg,], exact = T, x=x[trainavg,], y=y[trainavg]) 
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
lm(y~x, subset = trainavg)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## 
## Call:
## lm(formula = y ~ x, subset = trainavg)
## 
## Coefficients:
##       (Intercept)       xMDVP.Fo.Hz.      xMDVP.Fhi.Hz.      xMDVP.Flo.Hz.  
##         1.616e+01         -1.390e-01          1.053e-02          3.199e-02  
##   xMDVP.Jitter...  xMDVP.Jitter.Abs.          xMDVP.RAP          xMDVP.PPQ  
##         6.457e+01         -7.708e+05         -1.013e+06          4.988e+03  
##       xJitter.DDP      xMDVP.Shimmer  xMDVP.Shimmer.dB.      xShimmer.APQ3  
##         3.390e+05          3.633e+03         -1.177e+01         -7.903e+05  
##     xShimmer.APQ5          xMDVP.APQ       xShimmer.DDA               xNHR  
##        -2.930e+03         -6.484e+02          2.627e+05         -1.010e+01  
##              xHNR              xRPDE               xDFA           xspread1  
##                NA                 NA                 NA                 NA  
##          xspread2                xD2               xPPE  
##                NA                 NA                 NA
predict(ridge.mod,s=0, exact = T, type="coefficients", x=x[trainavg,], y=y[trainavg])[1:20,]
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     3.982968e+02    -2.515352e-01    -1.176685e-01    -1.135584e-01 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##    -8.343540e+02    -1.266549e+06     2.306985e+04    -4.446661e+03 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     1.403851e+03    -3.894412e+01     3.653470e+01    -5.647350e+02 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     3.416166e+02    -4.248073e+02     8.942996e+01    -1.723648e+02 
##              HNR             RPDE              DFA          spread1 
##    -1.050122e+00    -1.443907e+02    -8.948188e+01     3.023055e+01

Cross Validation for Ridge

set.seed(4706)
cv.out=cv.glmnet(x[trainavg,], y[trainavg], alpha=0, family = "binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 2.960083
#predict on test using best lambda
ridge.pred=predict(ridge.mod, s=bestlam, newx=x[testavg,])
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
#refit on full data
out=glmnet(x,y,alpha=0, family = "binomial")
predict(out,type="coefficients",s=bestlam)[1:20,]
##      (Intercept)      MDVP.Fo.Hz.     MDVP.Fhi.Hz.     MDVP.Flo.Hz. 
##     4.367547e-01    -1.062314e-03    -5.054642e-04    -1.301264e-03 
##   MDVP.Jitter... MDVP.Jitter.Abs.         MDVP.RAP         MDVP.PPQ 
##     5.349053e+00     1.003401e+03     8.588853e+00     9.797759e+00 
##       Jitter.DDP     MDVP.Shimmer MDVP.Shimmer.dB.     Shimmer.APQ3 
##     2.863335e+00     2.090017e+00     1.888996e-01     3.703174e+00 
##     Shimmer.APQ5         MDVP.APQ      Shimmer.DDA              NHR 
##     3.005893e+00     2.305308e+00     1.234277e+00     3.349048e-01 
##              HNR             RPDE              DFA          spread1 
##    -7.265099e-03     2.978426e-01     5.182344e-01     6.769356e-02

Lasso

#Fitting on training data
lasso.mod=glmnet(x[trainavg,], y[trainavg], alpha=1, lambda=grid, family="binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

#Getting best lambda 
set.seed(4706)
cv.out=cv.glmnet(x[trainavg,], y[trainavg], alpha=1, family="binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(cv.out)

bestlam=cv.out$lambda.min #lambda.min=lambda with lowest error
bestlam
## [1] 0.122311
lasso.pred=predict(lasso.mod, s=bestlam, newx=x[testavg,])
mean((lasso.pred!=y.testavg))
## [1] 1
#Refitting on full data using best lambda
out=glmnet(x, y, alpha=1, lambda=grid, family="binomial")
lasso.coef=predict(out, type="coefficients", s=bestlam)
lasso.coef
## 23 x 1 sparse Matrix of class "dgCMatrix"
##                           1
## (Intercept)      7.12316442
## MDVP.Fo.Hz.      .         
## MDVP.Fhi.Hz.     .         
## MDVP.Flo.Hz.     .         
## MDVP.Jitter...   .         
## MDVP.Jitter.Abs. .         
## MDVP.RAP         .         
## MDVP.PPQ         .         
## Jitter.DDP       .         
## MDVP.Shimmer     .         
## MDVP.Shimmer.dB. .         
## Shimmer.APQ3     .         
## Shimmer.APQ5     .         
## MDVP.APQ         .         
## Shimmer.DDA      .         
## NHR              .         
## HNR              .         
## RPDE             .         
## DFA              .         
## spread1          1.02192948
## spread2          0.07698237
## D2               .         
## PPE              .

Lasso: spread1+spread2 Best Subsets: MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.Flo.Hz.+MDVP.Jitter…+MDVP.Jitter.Abs.

Logistic Regression

I fit a logistic regression model on the training data

logist_reg_fit=glm(status~., data=pd2avg, family="binomial", subset=trainavg)

summary(logist_reg_fit)
## 
## Call:
## glm(formula = status ~ ., family = "binomial", data = pd2avg, 
##     subset = trainavg)
## 
## Deviance Residuals: 
##  [1]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 
## Coefficients: (7 not defined because of singularities)
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)       7.498e+02  4.080e+06       0        1
## MDVP.Fo.Hz.      -7.109e+00  4.004e+04       0        1
## MDVP.Fhi.Hz.      5.383e-01  3.485e+03       0        1
## MDVP.Flo.Hz.      1.636e+00  2.153e+04       0        1
## MDVP.Jitter...    3.301e+03  4.742e+08       0        1
## MDVP.Jitter.Abs. -3.941e+07  1.998e+11       0        1
## MDVP.RAP         -5.177e+07  8.979e+11       0        1
## MDVP.PPQ          2.551e+05  1.311e+09       0        1
## Jitter.DDP        1.733e+07  2.999e+11       0        1
## MDVP.Shimmer      1.858e+05  3.789e+09       0        1
## MDVP.Shimmer.dB. -6.020e+02  3.953e+07       0        1
## Shimmer.APQ3     -4.041e+07  2.469e+11       0        1
## Shimmer.APQ5     -1.498e+05  1.352e+09       0        1
## MDVP.APQ         -3.315e+04  1.032e+09       0        1
## Shimmer.DDA       1.343e+07  8.282e+10       0        1
## NHR              -5.163e+02  9.754e+07       0        1
## HNR                      NA         NA      NA       NA
## RPDE                     NA         NA      NA       NA
## DFA                      NA         NA      NA       NA
## spread1                  NA         NA      NA       NA
## spread2                  NA         NA      NA       NA
## D2                       NA         NA      NA       NA
## PPE                      NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.7995e+01  on 15  degrees of freedom
## Residual deviance: 2.5232e-10  on  0  degrees of freedom
## AIC: 32
## 
## Number of Fisher Scoring iterations: 24
#"Coefficients"
#coef(logist_reg_fit)

#"P-Values"
#summary(logist_reg_fit)$coef[ ,4]
alias(logist_reg_fit)
## Model :
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + 
##     MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + 
##     MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + 
##     Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 + 
##     D2 + PPE
## 
## Complete :
##         (Intercept)             MDVP.Fo.Hz.             MDVP.Fhi.Hz.           
## HNR                 -82879/5671           -10961/148076        -385325/39128651
## RPDE             -158161/156468             -349/170805         -32835/38997197
## DFA               176655/180827        -137763/38059693                       0
## spread1        17667495/2986322         -383479/2071270             3524/210831
## spread2          871889/1197555            -8563/505119            5638/5059529
## D2             24770678/2053841               -281/8072           14075/1957267
## PPE               137556/148921              -281/19851          21051/17505865
##         MDVP.Flo.Hz.            MDVP.Jitter...          MDVP.Jitter.Abs.       
## HNR                49702/276611        -251418644/43871  -1056184219703/6393458
## RPDE                1942/616161      1064396379/4850359        456427444/983027
## DFA              65500/37454927      -36588967/20966213       -1355726976/79847
## spread1            16466/282463        134301521/132460    -838661367980/876429
## spread2             3833/550609            627524/43743           -62958739/722
## D2                 -3521/441912       860819258/9794683    -216199595311/957652
## PPE               17783/3612099       160541211/2472707       -5202576191/71299
##         MDVP.RAP                MDVP.PPQ                Jitter.DDP             
## HNR         -350953741215/48578       1972495981/418605    1026611379775/425339
## RPDE    -2523318741583/11475549           -860119/20075    331672289147/4522823
## DFA                 2688385/351   13653250099/106635153          -10904904/4339
## spread1        -3470178386/1679         246243181/40573         3116509553/4512
## spread2     -26599422712/111873      1685536360/2499987        3108416869/39122
## D2        1279509244902/2147443            3445279/4022  -1280208734990/6446441
## PPE           -7746462176/42033            1873549/4052        2146996803/34861
##         MDVP.Shimmer            MDVP.Shimmer.dB.        Shimmer.APQ3           
## HNR              148883753/4547        -38096965/164901       19440390249/10610
## RPDE                1094199/926             -37448/1959      82120533305/629368
## DFA            88932442/2968375            -5780/182241    -68720962054/2017171
## spread1           67015907/9493   -8521374812/137541773    -204402519952/256985
## spread2          21575395/17773           -130899/10544     -13486835327/348808
## D2             -176330590/58491          7107040/196841    -331089559541/528989
## PPE            101139656/156693         -1033384/174191         -447412330/8541
##         Shimmer.APQ5            MDVP.APQ                Shimmer.DDA            
## HNR      -133706047907/11078571    -65059252027/7415207        -1153368752/1859
## RPDE             -5034363/27323        -47942681/135105   -101367534598/2311399
## DFA               -775633/11370                3535/522         196046243/17257
## spread1          -11603605/2876     -1551541081/1026898       10014804295/38011
## spread2          -9951190/17517         -18111419/62975     13110949869/1042237
## D2                6406723/35383        279348338/268789        2283910610/10893
## PPE       -15698135687/45933638            -448507/3141       1873408916/108267
##         NHR                    
## HNR               -1394957/1514
## RPDE              -504175/16323
## DFA              -448262/162297
## spread1      -452242799/4523833
## spread2     -569544937/28785675
## D2            141084697/1121771
## PPE                 -11409/1046
contrasts(pd2avg[,"status"])
##   1
## 0 0
## 1 1
glm_probs = predict(logist_reg_fit, pd2avg[testavg,], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(glm_probs)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01 
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01 
##            1            0            1            1            0            0
length(glm_pred)
## [1] 16
length(y.testavg)
## [1] 16
table(glm_pred, y.testavg)
##         y.testavg
## glm_pred 0 1
##        0 2 6
##        1 2 6

Classification Error

mean(glm_pred!=y.testavg)
## [1] 0.5

Accuracy

mean(glm_pred==y.testavg)
## [1] 0.5
colnames(pd2)
##  [1] "MDVP.Fo.Hz."      "MDVP.Fhi.Hz."     "MDVP.Flo.Hz."     "MDVP.Jitter..."  
##  [5] "MDVP.Jitter.Abs." "MDVP.RAP"         "MDVP.PPQ"         "Jitter.DDP"      
##  [9] "MDVP.Shimmer"     "MDVP.Shimmer.dB." "Shimmer.APQ3"     "Shimmer.APQ5"    
## [13] "MDVP.APQ"         "Shimmer.DDA"      "NHR"              "HNR"             
## [17] "RPDE"             "DFA"              "spread1"          "spread2"         
## [21] "D2"               "PPE"              "status"
logist_reg_fit2=glm(status~MDVP.Fhi.Hz.+MDVP.APQ+NHR+RPDE+spread1+D2+PPE, data=pd2avg, family=binomial, subset=trainavg)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logist_reg_fit)
## 
## Call:
## glm(formula = status ~ ., family = "binomial", data = pd2avg, 
##     subset = trainavg)
## 
## Deviance Residuals: 
##  [1]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 
## Coefficients: (7 not defined because of singularities)
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)       7.498e+02  4.080e+06       0        1
## MDVP.Fo.Hz.      -7.109e+00  4.004e+04       0        1
## MDVP.Fhi.Hz.      5.383e-01  3.485e+03       0        1
## MDVP.Flo.Hz.      1.636e+00  2.153e+04       0        1
## MDVP.Jitter...    3.301e+03  4.742e+08       0        1
## MDVP.Jitter.Abs. -3.941e+07  1.998e+11       0        1
## MDVP.RAP         -5.177e+07  8.979e+11       0        1
## MDVP.PPQ          2.551e+05  1.311e+09       0        1
## Jitter.DDP        1.733e+07  2.999e+11       0        1
## MDVP.Shimmer      1.858e+05  3.789e+09       0        1
## MDVP.Shimmer.dB. -6.020e+02  3.953e+07       0        1
## Shimmer.APQ3     -4.041e+07  2.469e+11       0        1
## Shimmer.APQ5     -1.498e+05  1.352e+09       0        1
## MDVP.APQ         -3.315e+04  1.032e+09       0        1
## Shimmer.DDA       1.343e+07  8.282e+10       0        1
## NHR              -5.163e+02  9.754e+07       0        1
## HNR                      NA         NA      NA       NA
## RPDE                     NA         NA      NA       NA
## DFA                      NA         NA      NA       NA
## spread1                  NA         NA      NA       NA
## spread2                  NA         NA      NA       NA
## D2                       NA         NA      NA       NA
## PPE                      NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.7995e+01  on 15  degrees of freedom
## Residual deviance: 2.5232e-10  on  0  degrees of freedom
## AIC: 32
## 
## Number of Fisher Scoring iterations: 24
glm_probs = predict(logist_reg_fit2, pd2[testavg,], type="response")
head(glm_probs)
## phon_R01_S06_1 phon_R01_S02_4 phon_R01_S02_6 phon_R01_S02_1 phon_R01_S01_2 
##              1              1              1              1              1 
## phon_R01_S01_1 
##              1
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S06_1 phon_R01_S02_4 phon_R01_S02_6 phon_R01_S02_1 phon_R01_S01_2 
##              1              1              1              1              1 
## phon_R01_S01_1 
##              1
table(glm_pred, y.testavg)
##         y.testavg
## glm_pred  0  1
##        0  3  0
##        1  1 12
"Error"
## [1] "Error"
mean(glm_pred!=y.testavg)
## [1] 0.0625
"Accuracy"
## [1] "Accuracy"
mean(glm_pred==y.testavg)
## [1] 0.9375

LDA

lda_fit = lda(status~., data=pd2avg[,-5], subset=trainavg)
## Warning in lda.default(x, grouping, ...): variables are collinear
lda_fit
## Call:
## lda(status ~ ., data = pd2avg[, -5], subset = trainavg)
## 
## Prior probabilities of groups:
##    0    1 
## 0.25 0.75 
## 
## Group means:
##   MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...    MDVP.RAP    MDVP.PPQ
## 0    166.4436     221.5269    142.05758    0.003315833 0.001477083 0.001695833
## 1    146.5777     204.6702     99.45608    0.006195972 0.003278056 0.003306905
##    Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5
## 0 0.004432500   0.01602375        0.1435833  0.008297917  0.009444583
## 1 0.009835417   0.03158137        0.2931925  0.016695397  0.018639960
##     MDVP.APQ Shimmer.DDA         NHR      HNR      RPDE       DFA   spread1
## 0 0.01297917  0.02489500 0.005145833 26.33188 0.4559212 0.7122583 -6.873966
## 1 0.02601188  0.05008696 0.029137917 21.33347 0.5212461 0.7051521 -5.482128
##     spread2       D2       PPE
## 0 0.1620567 2.062159 0.1155605
## 1 0.2425816 2.437170 0.2220227
## 
## Coefficients of linear discriminants:
##                            LD1
## MDVP.Fo.Hz.       7.229350e-02
## MDVP.Fhi.Hz.     -3.762147e-02
## MDVP.Flo.Hz.     -1.148403e-01
## MDVP.Jitter...   -1.365518e+03
## MDVP.RAP          9.213403e+00
## MDVP.PPQ          1.128217e+03
## Jitter.DDP        3.393728e+00
## MDVP.Shimmer      3.300002e+01
## MDVP.Shimmer.dB.  8.437520e+00
## Shimmer.APQ3     -3.084104e+02
## Shimmer.APQ5      1.326483e+03
## MDVP.APQ         -7.479707e+02
## Shimmer.DDA      -1.014674e+02
## NHR               9.625385e+01
## HNR               4.064078e-01
## RPDE             -2.087497e+01
## DFA               2.252635e+01
## spread1           9.047795e+00
## spread2           1.562251e+01
## D2               -1.877527e+00
## PPE              -3.901749e+01
plot(lda_fit) #produces plots of the linear discriminants

lda_pred = predict(lda_fit, pd2avg[testavg,])
names(lda_pred) #the names are  class, contains LDA’s predictions
## [1] "class"     "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.testavg) #predictions vs real values of test data
##          y.testavg
## lda_class  0  1
##         0  2  1
##         1  2 11
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
##   1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.testavg)
## [1] 0.8125
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.testavg)
## [1] 0.1875
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 3
sum(lda_pred$posterior[,1]<0.5)
## [1] 13
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2avg

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
##  3  6  4  1  2  2  5  3  1  5
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  lda_fit = lda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
  lda_class = lda_pred$class #the class predictions of our fitted model on test data
  error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
cv_error=mean(error)
cv_error
## [1] 0.2233333

Using Predictors Selected by Lasso

lda_fit = lda(status~spread1+spread2, data=pd2avg[,-5], subset=trainavg)
lda_fit
## Call:
## lda(status ~ spread1 + spread2, data = pd2avg[, -5], subset = trainavg)
## 
## Prior probabilities of groups:
##    0    1 
## 0.25 0.75 
## 
## Group means:
##     spread1   spread2
## 0 -6.873966 0.1620567
## 1 -5.482128 0.2425816
## 
## Coefficients of linear discriminants:
##              LD1
## spread1 1.296697
## spread2 3.163055
plot(lda_fit) #produces plots of the linear discriminants

lda_pred = predict(lda_fit, pd2avg[testavg,])
names(lda_pred) #the names are  class, contains LDA’s predictions
## [1] "class"     "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.testavg) #predictions vs real values of test data
##          y.testavg
## lda_class  0  1
##         0  2  1
##         1  2 11
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
##   1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.testavg)
## [1] 0.8125
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.testavg)
## [1] 0.1875
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 3
sum(lda_pred$posterior[,1]<0.5)
## [1] 13
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2avg

set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
##  1  2  3  4  5  6  7  8  9 10 
##  3  6  4  1  2  2  5  3  1  5
error=vector()#error for each fold

for (j in 1:k) #j refers to folds
{
  lda_fit = lda(status~spread1+spread2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
  lda_class = lda_pred$class #the class predictions of our fitted model on test data
  error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}

cv_error=mean(error)
cv_error
## [1] 0.07833333

QDA

#qda_fit=qda(status~., data=pd2avg, subset = trainavg)
#qda_fit

#qda_class=predict(qda_fit, pd2avg[testavg,])$class
#table(qda_class, y.testavg)

#"Accuracy"
#mean(qda_class==y.testavg)

#"Classification Error"
#mean(qda_class!=y.testavg)
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100

Using Predictors Selected by Lasso

#CV for QDA
#k=10 #number of folds
#Dataset=pd2

#set.seed(4706)
#folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
#Dataset=cbind(Dataset, folds)
#table(folds) #to see that they are balanced
#error=vector()#error for each fold

#for (j in 1:k) #j refers to folds
#{
  #qda_fit = qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold

  #qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
  #error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
#}

#cv_error=mean(error)
#cv_error

Trees

tree.pd=tree(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg)
summary(tree.pd)
## 
## Classification tree:
## tree(formula = pd2avg[, "status"] ~ ., data = pd2avg[, 1:22], 
##     subset = trainavg)
## Variables actually used in tree construction:
## [1] "MDVP.RAP"
## Number of terminal nodes:  2 
## Residual mean deviance:  0.5456 = 7.638 / 14 
## Misclassification error rate: 0.125 = 2 / 16
plot(tree.pd)
text(tree.pd, pretty=0)

Spread1 seems to be the most important predictor here.

tree.pred=predict(tree.pd, pd2avg[testavg,], type="class")
table(tree.pred, y.testavg)
##          y.testavg
## tree.pred  0  1
##         0  2  2
##         1  2 10
"Accuracy"
## [1] "Accuracy"
mean(tree.pred==y.testavg)
## [1] 0.75
"Error"
## [1] "Error"
mean(tree.pred!=y.testavg)
## [1] 0.25

Pruning

The test error rate seems good. We consider pruning the tree.

set.seed(4706)
cv.pd=cv.tree(tree.pd, FUN=prune.misclass)
names(cv.pd)
## [1] "size"   "dev"    "k"      "method"
cv.pd
## $size
## [1] 2 1
## 
## $dev
## [1] 6 7
## 
## $k
## [1] -Inf    2
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
par(mfrow=c(1,2))
plot(cv.pd$size, cv.pd$dev, type="b", ylab="cv error")
plot(cv.pd$k, cv.pd$dev, type="b", ylab = "cv error")

Prune the tree and obtain the 2-node tree

pruned.pd=prune.misclass(tree.pd, best=2)
plot(pruned.pd)
text(pruned.pd, pretty=0)

pruned.pred=predict(pruned.pd, pd2avg[testavg,], type="class")
table(pruned.pred, y.testavg)
##            y.testavg
## pruned.pred  0  1
##           0  2  2
##           1  2 10
"Accuracy"
## [1] "Accuracy"
mean(pruned.pred==y.testavg)
## [1] 0.75
"Error"
## [1] "Error"
mean(pruned.pred!=y.testavg)
## [1] 0.25

The unpruned tree actually performed better than the pruned tree.

Bagging and Random Forest

BAGGING

#bagging
set.seed(4706)
bag.pd=randomForest(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg, mtry=13, importance=TRUE)
#mtry: Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
bag.pd
## 
## Call:
##  randomForest(formula = pd2avg[, "status"] ~ ., data = pd2avg[,      1:22], mtry = 13, importance = TRUE, subset = trainavg) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 13
## 
##         OOB estimate of  error rate: 18.75%
## Confusion matrix:
##   0  1 class.error
## 0 2  2  0.50000000
## 1 1 11  0.08333333
plot(bag.pd)

yhat.bag=predict(bag.pd, newdata = pd2avg[testavg,])
plot(yhat.bag, y.testavg, xlab="bagging pred", ylab="actual")

"Error"
## [1] "Error"
mean(yhat.bag!=y.testavg)
## [1] 0.1875
"Accuracy"
## [1] "Accuracy"
mean(yhat.bag==y.testavg)
## [1] 0.8125

RANDOM FOREST

set.seed(4706)
rf.pd=randomForest(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg, mtry=6, importance=TRUE)
yhat.rf=predict(rf.pd, newdata = pd2avg[testavg,])
rf.pd
## 
## Call:
##  randomForest(formula = pd2avg[, "status"] ~ ., data = pd2avg[,      1:22], mtry = 6, importance = TRUE, subset = trainavg) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 18.75%
## Confusion matrix:
##   0  1 class.error
## 0 2  2  0.50000000
## 1 1 11  0.08333333
plot(rf.pd)

"Error"
## [1] "Error"
mean(yhat.rf!=y.testavg)
## [1] 0.1875
"Accuracy"
## [1] "Accuracy"
mean(yhat.rf==y.testavg)
## [1] 0.8125
importance(rf.pd)
##                           0          1 MeanDecreaseAccuracy MeanDecreaseGini
## MDVP.Fo.Hz.       0.0000000 -1.7815347           -1.5904517       0.09742812
## MDVP.Fhi.Hz.      0.0000000 -1.9219142           -1.9590384       0.10147491
## MDVP.Flo.Hz.      0.9052750 -0.8170415           -0.1455245       0.06940220
## MDVP.Jitter...   -0.4473031 -0.2018101           -0.3513199       0.06910079
## MDVP.Jitter.Abs.  1.5753235  1.4027521            1.6479090       0.08396429
## MDVP.RAP          4.3303869  3.5541055            4.2763050       0.63893095
## MDVP.PPQ          3.1495376  2.5798419            3.0042845       0.24788533
## Jitter.DDP        4.2965188  2.9975250            3.9475962       0.74593191
## MDVP.Shimmer      1.1353541 -0.8093464           -0.2458390       0.12854286
## MDVP.Shimmer.dB.  0.4927843  1.3850664            1.2311200       0.28830539
## Shimmer.APQ3      1.3440623  1.6477889            2.1561750       0.08777047
## Shimmer.APQ5      0.8256994  1.7337508            1.8069892       0.19978665
## MDVP.APQ          1.2437604 -2.0813379           -1.3907097       0.28018105
## Shimmer.DDA      -1.0010015 -0.5346753           -0.7146504       0.06769658
## NHR               3.4916476  2.0107387            2.7937183       0.71689267
## HNR              -1.0010015 -0.3358583           -0.7477619       0.10969811
## RPDE             -1.7372705 -1.0010015           -1.6925888       0.03998810
## DFA              -1.7026641 -1.2703560           -1.6412227       0.05978581
## spread1           3.4975234  3.2373497            3.7180401       0.40137408
## spread2           1.4170505 -1.1150545           -0.3283415       0.18264000
## D2                3.2717921  1.6619264            2.7608042       0.62698535
## PPE               4.3453977  4.0778719            4.4180393       0.47423438

Boosting

#ibrary(gbm)
#gbm() with the option distribution="gaussian" since this is a regression problem; if it were a bi- nary classification problem, we would use distribution="bernoulli". The argument n.trees=5000 indicates that we want 5000 trees, and the option interaction.depth=4 limits the depth of each tree

#set.seed(4706)

#boost.pd=gbm(pd2avg[trainavg,"status"]~., pd2avg[trainavg,1:22], distribution = "gaussian", n.trees=5000, interaction.depth=4)
#shrinkage=0.001 by default
#can change it
#summary(boost.pd)
#par(mfrow=c(1,2))
#plot(boost.pd, i="spread1")
#plot(boost.pd, i="MDVP.Fo.Hz.")

P-values are all very large

contrasts(pd2avg[,"status"])
##   1
## 0 0
## 1 1
glm_probsavg = predict(logist_reg_fit, pd2avg[testavg,], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(glm_probsavg)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01 
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
glm_predavg=ifelse(glm_probsavg>0.5, 1, 0)
head(glm_predavg)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01 
##            1            0            1            1            0            0
length(glm_predavg)
## [1] 16
length(y.testavg)
## [1] 16
table(glm_predavg, y.testavg)
##            y.testavg
## glm_predavg 0 1
##           0 2 6
##           1 2 6

Classification Error

mean(glm_predavg!=y.testavg)
## [1] 0.5

Accuracy

mean(glm_predavg==y.testavg)
## [1] 0.5

Logistic Regression 0.1752577 LDA (full) 0.1172264 QDA (full) 0.1384934 Tree (full) 0.1443299 Tree (pruned) 0.1649485 Bagging 0.07216495 Random Forest 0.08247423